Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 490 results for author: <span class="mathjax">Tan, Z</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Tan%2C+Z">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Tan, Z"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Tan%2C+Z&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Tan, Z"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Tan%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Tan%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.08745">arXiv:2502.08745</a> <span> [<a href="https://arxiv.org/pdf/2502.08745">pdf</a>, <a href="https://arxiv.org/format/2502.08745">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> IHEval: Evaluating Language Models on Following the Instruction Hierarchy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhihan Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shiyang Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zixuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xin Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Haoming Jiang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xianfeng Tang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yifan Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zheng Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haodong Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhaoxuan Tan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yichuan Li</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Q">Qingyu Yin</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+B">Bing Yin</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+M">Meng Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.08745v1-abstract-short" style="display: inline;"> The instruction hierarchy, which establishes a priority order from system messages to user messages, conversation history, and tool outputs, is essential for ensuring consistent and safe behavior in language models (LMs). Despite its importance, this topic receives limited attention, and there is a lack of comprehensive benchmarks for evaluating models' ability to follow the instruction hierarchy.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08745v1-abstract-full').style.display = 'inline'; document.getElementById('2502.08745v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.08745v1-abstract-full" style="display: none;"> The instruction hierarchy, which establishes a priority order from system messages to user messages, conversation history, and tool outputs, is essential for ensuring consistent and safe behavior in language models (LMs). Despite its importance, this topic receives limited attention, and there is a lack of comprehensive benchmarks for evaluating models' ability to follow the instruction hierarchy. We bridge this gap by introducing IHEval, a novel benchmark comprising 3,538 examples across nine tasks, covering cases where instructions in different priorities either align or conflict. Our evaluation of popular LMs highlights their struggle to recognize instruction priorities. All evaluated models experience a sharp performance decline when facing conflicting instructions, compared to their original instruction-following performance. Moreover, the most competitive open-source model only achieves 48% accuracy in resolving such conflicts. Our results underscore the need for targeted optimization in the future development of LMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.08745v1-abstract-full').style.display = 'none'; document.getElementById('2502.08745v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NAACL 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07968">arXiv:2502.07968</a> <span> [<a href="https://arxiv.org/pdf/2502.07968">pdf</a>, <a href="https://arxiv.org/format/2502.07968">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Generative Risk Minimization for Out-of-Distribution Generalization on Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yaochen Zhu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chuxu Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jundong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07968v1-abstract-short" style="display: inline;"> Out-of-distribution (OOD) generalization on graphs aims at dealing with scenarios where the test graph distribution differs from the training graph distributions. Compared to i.i.d. data like images, the OOD generalization problem on graph-structured data remains challenging due to the non-i.i.d. property and complex structural information on graphs. Recently, several works on graph OOD generaliza… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07968v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07968v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07968v1-abstract-full" style="display: none;"> Out-of-distribution (OOD) generalization on graphs aims at dealing with scenarios where the test graph distribution differs from the training graph distributions. Compared to i.i.d. data like images, the OOD generalization problem on graph-structured data remains challenging due to the non-i.i.d. property and complex structural information on graphs. Recently, several works on graph OOD generalization have explored extracting invariant subgraphs that share crucial classification information across different distributions. Nevertheless, such a strategy could be suboptimal for entirely capturing the invariant information, as the extraction of discrete structures could potentially lead to the loss of invariant information or the involvement of spurious information. In this paper, we propose an innovative framework, named Generative Risk Minimization (GRM), designed to generate an invariant subgraph for each input graph to be classified, instead of extraction. To address the challenge of optimization in the absence of optimal invariant subgraphs (i.e., ground truths), we derive a tractable form of the proposed GRM objective by introducing a latent causal variable, and its effectiveness is validated by our theoretical analysis. We further conduct extensive experiments across a variety of real-world graph datasets for both node-level and graph-level OOD generalization, and the results demonstrate the superiority of our framework GRM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07968v1-abstract-full').style.display = 'none'; document.getElementById('2502.07968v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">TMLR 02/2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07942">arXiv:2502.07942</a> <span> [<a href="https://arxiv.org/pdf/2502.07942">pdf</a>, <a href="https://arxiv.org/format/2502.07942">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Symbiotic Cooperation for Web Agents: Harnessing Complementary Strengths of Large and Small LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruichen Zhang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+M">Mufan Qiu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mohan Zhang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+V">Vincent Lu</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+J">Jie Peng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+K">Kaidi Xu</a>, <a href="/search/cs?searchtype=author&query=Agudelo%2C+L+Z">Leandro Z. Agudelo</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+P">Peter Qian</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianlong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07942v1-abstract-short" style="display: inline;"> Web browsing agents powered by large language models (LLMs) have shown tremendous potential in automating complex web-based tasks. Existing approaches typically rely on large LLMs (e.g., GPT-4o) to explore web environments and generate trajectory data, which is then used either for demonstration retrieval (for large LLMs) or to distill small LLMs (e.g., Llama3) in a process that remains decoupled… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07942v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07942v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07942v1-abstract-full" style="display: none;"> Web browsing agents powered by large language models (LLMs) have shown tremendous potential in automating complex web-based tasks. Existing approaches typically rely on large LLMs (e.g., GPT-4o) to explore web environments and generate trajectory data, which is then used either for demonstration retrieval (for large LLMs) or to distill small LLMs (e.g., Llama3) in a process that remains decoupled from the exploration. In this paper, we propose AgentSymbiotic, an iterative framework that couples data synthesis with task-performance, yielding a "symbiotic improvement" for both large and small LLMs. Our study uncovers a complementary dynamic between LLM types: while large LLMs excel at generating high-quality trajectories for distillation, the distilled small LLMs-owing to their distinct reasoning capabilities-often choose actions that diverge from those of their larger counterparts. This divergence drives the exploration of novel trajectories, thereby enriching the synthesized data. However, we also observe that the performance of small LLMs becomes a bottleneck in this iterative enhancement process. To address this, we propose two innovations in LLM distillation: a speculative data synthesis strategy that mitigates off-policy bias, and a multi-task learning approach designed to boost the reasoning capabilities of the student LLM. Furthermore, we introduce a Hybrid Mode for Privacy Preservation to address user privacy concerns. Evaluated on the WEBARENA benchmark, AgentSymbiotic achieves SOTA performance with both LLM types. Our best Large LLM agent reaches 52%, surpassing the previous best of 45%, while our 8B distilled model demonstrates a competitive 49%, exceeding the prior best of 28%. Code will be released upon acceptance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07942v1-abstract-full').style.display = 'none'; document.getElementById('2502.07942v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07352">arXiv:2502.07352</a> <span> [<a href="https://arxiv.org/pdf/2502.07352">pdf</a>, <a href="https://arxiv.org/format/2502.07352">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Digital Libraries">cs.DL</span> </div> </div> <p class="title is-5 mathjax"> Bridging the Evaluation Gap: Leveraging Large Language Models for Topic Model Evaluation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhiyin Tan</a>, <a href="/search/cs?searchtype=author&query=D%27Souza%2C+J">Jennifer D'Souza</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07352v1-abstract-short" style="display: inline;"> This study presents a framework for automated evaluation of dynamically evolving topic taxonomies in scientific literature using Large Language Models (LLMs). In digital library systems, topic modeling plays a crucial role in efficiently organizing and retrieving scholarly content, guiding researchers through complex knowledge landscapes. As research domains proliferate and shift, traditional huma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07352v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07352v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07352v1-abstract-full" style="display: none;"> This study presents a framework for automated evaluation of dynamically evolving topic taxonomies in scientific literature using Large Language Models (LLMs). In digital library systems, topic modeling plays a crucial role in efficiently organizing and retrieving scholarly content, guiding researchers through complex knowledge landscapes. As research domains proliferate and shift, traditional human centric and static evaluation methods struggle to maintain relevance. The proposed approach harnesses LLMs to measure key quality dimensions, such as coherence, repetitiveness, diversity, and topic-document alignment, without heavy reliance on expert annotators or narrow statistical metrics. Tailored prompts guide LLM assessments, ensuring consistent and interpretable evaluations across various datasets and modeling techniques. Experiments on benchmark corpora demonstrate the method's robustness, scalability, and adaptability, underscoring its value as a more holistic and dynamic alternative to conventional evaluation strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07352v1-abstract-full').style.display = 'none'; document.getElementById('2502.07352v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by IRCDL 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02088">arXiv:2502.02088</a> <span> [<a href="https://arxiv.org/pdf/2502.02088">pdf</a>, <a href="https://arxiv.org/format/2502.02088">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> IPO: Iterative Preference Optimization for Text-to-Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaomeng Yang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhiyu Tan</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+X">Xuecheng Nie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.02088v2-abstract-short" style="display: inline;"> Video foundation models have achieved significant advancement with the help of network upgrade as well as model scale-up. However, they are still hard to meet requirements of applications due to unsatisfied generation quality. To solve this problem, we propose to align video foundation models with human preferences from the perspective of post-training in this paper. Consequently, we introduce an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02088v2-abstract-full').style.display = 'inline'; document.getElementById('2502.02088v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.02088v2-abstract-full" style="display: none;"> Video foundation models have achieved significant advancement with the help of network upgrade as well as model scale-up. However, they are still hard to meet requirements of applications due to unsatisfied generation quality. To solve this problem, we propose to align video foundation models with human preferences from the perspective of post-training in this paper. Consequently, we introduce an Iterative Preference Optimization strategy to enhance generated video quality by incorporating human feedback. Specifically, IPO exploits a critic model to justify video generations for pairwise ranking as in Direct Preference Optimization or point-wise scoring as in Kahneman-Tversky Optimization. Given this, IPO optimizes video foundation models with guidance of signals from preference feedback, which helps improve generated video quality in subject consistency, motion smoothness and aesthetic quality, etc. In addition, IPO incorporates the critic model with the multi-modality large language model, which enables it to automatically assign preference labels without need of retraining or relabeling. In this way, IPO can efficiently perform multi-round preference optimization in an iterative manner, without the need of tediously manual labeling. Comprehensive experiments demonstrate that the proposed IPO can effectively improve the video generation quality of a pretrained model and help a model with only 2B parameters surpass the one with 5B parameters. Besides, IPO achieves new state-of-the-art performance on VBench benchmark. We will release our source codes, models as well as dataset to advance future research and applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.02088v2-abstract-full').style.display = 'none'; document.getElementById('2502.02088v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16365">arXiv:2501.16365</a> <span> [<a href="https://arxiv.org/pdf/2501.16365">pdf</a>, <a href="https://arxiv.org/format/2501.16365">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CAND: Cross-Domain Ambiguity Inference for Early Detecting Nuanced Illness Deterioration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ting%2C+L+P">Lo Pang-Yun Ting</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hong-Pei Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Cheng-Te Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Po-Lin Chen</a>, <a href="/search/cs?searchtype=author&query=Chuang%2C+K">Kun-Ta Chuang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16365v1-abstract-short" style="display: inline;"> Early detection of patient deterioration is essential for timely treatment, with vital signs like heart rates being key health indicators. Existing methods tend to solely analyze vital sign waveforms, ignoring transition relationships of waveforms within each vital sign and the correlation strengths among various vital signs. Such studies often overlook nuanced illness deterioration, which is the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16365v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16365v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16365v1-abstract-full" style="display: none;"> Early detection of patient deterioration is essential for timely treatment, with vital signs like heart rates being key health indicators. Existing methods tend to solely analyze vital sign waveforms, ignoring transition relationships of waveforms within each vital sign and the correlation strengths among various vital signs. Such studies often overlook nuanced illness deterioration, which is the early sign of worsening health but is difficult to detect. In this paper, we introduce CAND, a novel method that organizes the transition relationships and the correlations within and among vital signs as domain-specific and cross-domain knowledge. CAND jointly models these knowledge in a unified representation space, considerably enhancing the early detection of nuanced illness deterioration. In addition, CAND integrates a Bayesian inference method that utilizes augmented knowledge from domain-specific and cross-domain knowledge to address the ambiguities in correlation strengths. With this architecture, the correlation strengths can be effectively inferred to guide joint modeling and enhance representations of vital signs. This allows a more holistic and accurate interpretation of patient health. Our experiments on a real-world ICU dataset demonstrate that CAND significantly outperforms existing methods in both effectiveness and earliness in detecting nuanced illness deterioration. Moreover, we conduct a case study for the interpretable detection process to showcase the practicality of CAND. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16365v1-abstract-full').style.display = 'none'; document.getElementById('2501.16365v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14431">arXiv:2501.14431</a> <span> [<a href="https://arxiv.org/pdf/2501.14431">pdf</a>, <a href="https://arxiv.org/format/2501.14431">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Domaino1s: Guiding LLM Reasoning for Explainable Answers in High-Stakes Domains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xu Chu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhijie Tan</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+H">Hanlin Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guanyu Wang</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+T">Tong Mo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weiping Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14431v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) are widely applied to downstream domains. However, current LLMs for high-stakes domain tasks, such as financial investment and legal QA, typically generate brief answers without reasoning processes and explanations. This limits users' confidence in making decisions based on their responses. While original CoT shows promise, it lacks self-correction mechanisms during re… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14431v1-abstract-full').style.display = 'inline'; document.getElementById('2501.14431v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14431v1-abstract-full" style="display: none;"> Large Language Models (LLMs) are widely applied to downstream domains. However, current LLMs for high-stakes domain tasks, such as financial investment and legal QA, typically generate brief answers without reasoning processes and explanations. This limits users' confidence in making decisions based on their responses. While original CoT shows promise, it lacks self-correction mechanisms during reasoning. This work introduces Domain$o1$s, which enhances LLMs' reasoning capabilities on domain tasks through supervised fine-tuning and tree search. We construct CoT-stock-2k and CoT-legal-2k datasets for fine-tuning models that activate domain-specific reasoning steps based on their judgment. Additionally, we propose Selective Tree Exploration to spontaneously explore solution spaces and sample optimal reasoning paths to improve performance. We also introduce PROOF-Score, a new metric for evaluating domain models' explainability, complementing traditional accuracy metrics with richer assessment dimensions. Extensive experiments on stock investment recommendation and legal reasoning QA tasks demonstrate Domaino1s's leading performance and explainability. Our code is available at https://anonymous.4open.science/r/Domaino1s-006F/. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14431v1-abstract-full').style.display = 'none'; document.getElementById('2501.14431v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.14427">arXiv:2501.14427</a> <span> [<a href="https://arxiv.org/pdf/2501.14427">pdf</a>, <a href="https://arxiv.org/format/2501.14427">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GraphSOS: Graph Sampling and Order Selection to Help LLMs Understand Graphs Better </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xu Chu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+H">Hanlin Xue</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhijie Tan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingce Wang</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+T">Tong Mo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weiping Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14427v3-abstract-short" style="display: inline;"> The success of Large Language Models (LLMs) in various domains has led researchers to apply them to graph-related problems by converting graph data into natural language text. However, unlike graph data, natural language inherently has sequential order. We observe a counter-intuitive fact that when the order of nodes or edges in the natural language description of a graph is shuffled, despite desc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14427v3-abstract-full').style.display = 'inline'; document.getElementById('2501.14427v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14427v3-abstract-full" style="display: none;"> The success of Large Language Models (LLMs) in various domains has led researchers to apply them to graph-related problems by converting graph data into natural language text. However, unlike graph data, natural language inherently has sequential order. We observe a counter-intuitive fact that when the order of nodes or edges in the natural language description of a graph is shuffled, despite describing the same graph, model performance fluctuates between high performance and random guessing. Additionally, due to LLMs' limited input context length, current methods typically randomly sample neighbors of target nodes as representatives of their neighborhood, which may not always be effective for accurate reasoning. To address these gaps, we introduce GraphSOS (Graph Sampling and Order Selection). This novel model framework features an Order Selector Module to ensure proper serialization order of the graph and a Subgraph Sampling Module to sample subgraphs with better structure for better reasoning. Furthermore, we propose Graph CoT obtained through distillation, and enhance LLM's reasoning and zero-shot learning capabilities for graph tasks through instruction tuning. Experiments on multiple datasets for node classification and graph question-answering demonstrate that GraphSOS improves LLMs' performance and generalization ability on graph tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14427v3-abstract-full').style.display = 'none'; document.getElementById('2501.14427v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.13391">arXiv:2501.13391</a> <span> [<a href="https://arxiv.org/pdf/2501.13391">pdf</a>, <a href="https://arxiv.org/format/2501.13391">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Can Large Language Models Understand Preferences in Personalized Recommendation? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhaoxuan Tan</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zinan Zeng</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Q">Qingkai Zeng</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhenyu Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zheyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+F">Fengran Mo</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+M">Meng Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.13391v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) excel in various tasks, including personalized recommendations. Existing evaluation methods often focus on rating prediction, relying on regression errors between actual and predicted ratings. However, user rating bias and item quality, two influential factors behind rating scores, can obscure personal preferences in user-item pair data. To address this, we introduce P… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13391v1-abstract-full').style.display = 'inline'; document.getElementById('2501.13391v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.13391v1-abstract-full" style="display: none;"> Large Language Models (LLMs) excel in various tasks, including personalized recommendations. Existing evaluation methods often focus on rating prediction, relying on regression errors between actual and predicted ratings. However, user rating bias and item quality, two influential factors behind rating scores, can obscure personal preferences in user-item pair data. To address this, we introduce PerRecBench, disassociating the evaluation from these two factors and assessing recommendation techniques on capturing the personal preferences in a grouped ranking manner. We find that the LLM-based recommendation techniques that are generally good at rating prediction fail to identify users' favored and disfavored items when the user rating bias and item quality are eliminated by grouping users. With PerRecBench and 19 LLMs, we find that while larger models generally outperform smaller ones, they still struggle with personalized recommendation. Our findings reveal the superiority of pairwise and listwise ranking approaches over pointwise ranking, PerRecBench's low correlation with traditional regression metrics, the importance of user profiles, and the role of pretraining data distributions. We further explore three supervised fine-tuning strategies, finding that merging weights from single-format training is promising but improving LLMs' understanding of user preferences remains an open research problem. Code and data are available at https://github.com/TamSiuhin/PerRecBench <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.13391v1-abstract-full').style.display = 'none'; document.getElementById('2501.13391v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.11512">arXiv:2501.11512</a> <span> [<a href="https://arxiv.org/pdf/2501.11512">pdf</a>, <a href="https://arxiv.org/format/2501.11512">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multitask Auxiliary Network for Perceptual Quality Assessment of Non-Uniformly Distorted Omnidirectional Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jiebin Yan</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+J">Jiale Rao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junjie Chen</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Ziwen Tan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Weide Liu</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+Y">Yuming Fang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.11512v1-abstract-short" style="display: inline;"> Omnidirectional image quality assessment (OIQA) has been widely investigated in the past few years and achieved much success. However, most of existing studies are dedicated to solve the uniform distortion problem in OIQA, which has a natural gap with the non-uniform distortion problem, and their ability in capturing non-uniform distortion is far from satisfactory. To narrow this gap, in this pape… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11512v1-abstract-full').style.display = 'inline'; document.getElementById('2501.11512v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.11512v1-abstract-full" style="display: none;"> Omnidirectional image quality assessment (OIQA) has been widely investigated in the past few years and achieved much success. However, most of existing studies are dedicated to solve the uniform distortion problem in OIQA, which has a natural gap with the non-uniform distortion problem, and their ability in capturing non-uniform distortion is far from satisfactory. To narrow this gap, in this paper, we propose a multitask auxiliary network for non-uniformly distorted omnidirectional images, where the parameters are optimized by jointly training the main task and other auxiliary tasks. The proposed network mainly consists of three parts: a backbone for extracting multiscale features from the viewport sequence, a multitask feature selection module for dynamically allocating specific features to different tasks, and auxiliary sub-networks for guiding the proposed model to capture local distortion and global quality change. Extensive experiments conducted on two large-scale OIQA databases demonstrate that the proposed model outperforms other state-of-the-art OIQA metrics, and these auxiliary sub-networks contribute to improve the performance of the proposed model. The source code is available at https://github.com/RJL2000/MTAOIQA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.11512v1-abstract-full').style.display = 'none'; document.getElementById('2501.11512v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10011">arXiv:2501.10011</a> <span> [<a href="https://arxiv.org/pdf/2501.10011">pdf</a>, <a href="https://arxiv.org/format/2501.10011">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Mitigating Hallucinations on Object Attributes using Multiview Images and Negative Instructions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhijie Tan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuzhi Li</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+S">Shengwei Meng</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xiang Yuan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weiping Li</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+T">Tong Mo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingce Wang</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xu Chu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10011v1-abstract-short" style="display: inline;"> Current popular Large Vision-Language Models (LVLMs) are suffering from Hallucinations on Object Attributes (HoOA), leading to incorrect determination of fine-grained attributes in the input images. Leveraging significant advancements in 3D generation from a single image, this paper proposes a novel method to mitigate HoOA in LVLMs. This method utilizes multiview images sampled from generated 3D r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10011v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10011v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10011v1-abstract-full" style="display: none;"> Current popular Large Vision-Language Models (LVLMs) are suffering from Hallucinations on Object Attributes (HoOA), leading to incorrect determination of fine-grained attributes in the input images. Leveraging significant advancements in 3D generation from a single image, this paper proposes a novel method to mitigate HoOA in LVLMs. This method utilizes multiview images sampled from generated 3D representations as visual prompts for LVLMs, thereby providing more visual information from other viewpoints. Furthermore, we observe the input order of multiple multiview images significantly affects the performance of LVLMs. Consequently, we have devised Multiview Image Augmented VLM (MIAVLM), incorporating a Multiview Attributes Perceiver (MAP) submodule capable of simultaneously eliminating the influence of input image order and aligning visual information from multiview images with Large Language Models (LLMs). Besides, we designed and employed negative instructions to mitigate LVLMs' bias towards ``Yes" responses. Comprehensive experiments demonstrate the effectiveness of our method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10011v1-abstract-full').style.display = 'none'; document.getElementById('2501.10011v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2025 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.10010">arXiv:2501.10010</a> <span> [<a href="https://arxiv.org/pdf/2501.10010">pdf</a>, <a href="https://arxiv.org/format/2501.10010">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Spatiotemporal Augmentation for Improving Dynamic Graph Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xu Chu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+H">Hanlin Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingce Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyang Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weiping Li</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+T">Tong Mo</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+T">Tuoyu Feng</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhijie Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.10010v1-abstract-short" style="display: inline;"> Dynamic graph augmentation is used to improve the performance of dynamic GNNs. Most methods assume temporal locality, meaning that recent edges are more influential than earlier edges. However, for temporal changes in edges caused by random noise, overemphasizing recent edges while neglecting earlier ones may lead to the model capturing noise. To address this issue, we propose STAA (SpatioTemporal… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10010v1-abstract-full').style.display = 'inline'; document.getElementById('2501.10010v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.10010v1-abstract-full" style="display: none;"> Dynamic graph augmentation is used to improve the performance of dynamic GNNs. Most methods assume temporal locality, meaning that recent edges are more influential than earlier edges. However, for temporal changes in edges caused by random noise, overemphasizing recent edges while neglecting earlier ones may lead to the model capturing noise. To address this issue, we propose STAA (SpatioTemporal Activity-Aware Random Walk Diffusion). STAA identifies nodes likely to have noisy edges in spatiotemporal dimensions. Spatially, it analyzes critical topological positions through graph wavelet coefficients. Temporally, it analyzes edge evolution through graph wavelet coefficient change rates. Then, random walks are used to reduce the weights of noisy edges, deriving a diffusion matrix containing spatiotemporal information as an augmented adjacency matrix for dynamic GNN learning. Experiments on multiple datasets show that STAA outperforms other dynamic graph augmentation methods in node classification and link prediction tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.10010v1-abstract-full').style.display = 'none'; document.getElementById('2501.10010v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2025 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06146">arXiv:2501.06146</a> <span> [<a href="https://arxiv.org/pdf/2501.06146">pdf</a>, <a href="https://arxiv.org/format/2501.06146">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> xLSTM-SENet: xLSTM for Single-Channel Speech Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=K%C3%BChne%2C+N+L">Nikolai Lund K眉hne</a>, <a href="/search/cs?searchtype=author&query=%C3%98stergaard%2C+J">Jan 脴stergaard</a>, <a href="/search/cs?searchtype=author&query=Jensen%2C+J">Jesper Jensen</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zheng-Hua Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06146v1-abstract-short" style="display: inline;"> While attention-based architectures, such as Conformers, excel in speech enhancement, they face challenges such as scalability with respect to input sequence length. In contrast, the recently proposed Extended Long Short-Term Memory (xLSTM) architecture offers linear scalability. However, xLSTM-based models remain unexplored for speech enhancement. This paper introduces xLSTM-SENet, the first xLST… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06146v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06146v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06146v1-abstract-full" style="display: none;"> While attention-based architectures, such as Conformers, excel in speech enhancement, they face challenges such as scalability with respect to input sequence length. In contrast, the recently proposed Extended Long Short-Term Memory (xLSTM) architecture offers linear scalability. However, xLSTM-based models remain unexplored for speech enhancement. This paper introduces xLSTM-SENet, the first xLSTM-based single-channel speech enhancement system. A comparative analysis reveals that xLSTM-and notably, even LSTM-can match or outperform state-of-the-art Mamba- and Conformer-based systems across various model sizes in speech enhancement on the VoiceBank+Demand dataset. Through ablation studies, we identify key architectural design choices such as exponential gating and bidirectionality contributing to its effectiveness. Our best xLSTM-based model, xLSTM-SENet2, outperforms state-of-the-art Mamba- and Conformer-based systems on the Voicebank+DEMAND dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06146v1-abstract-full').style.display = 'none'; document.getElementById('2501.06146v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03523">arXiv:2501.03523</a> <span> [<a href="https://arxiv.org/pdf/2501.03523">pdf</a>, <a href="https://arxiv.org/ps/2501.03523">ps</a>, <a href="https://arxiv.org/format/2501.03523">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Vocal Tract Length Warped Features for Spoken Keyword Spotting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sarkar%2C+A+k">Achintya kr. Sarkar</a>, <a href="/search/cs?searchtype=author&query=Dwivedi%2C+P">Priyanka Dwivedi</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zheng-Hua Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03523v1-abstract-short" style="display: inline;"> In this paper, we propose several methods that incorporate vocal tract length (VTL) warped features for spoken keyword spotting (KWS). The first method, VTL-independent KWS, involves training a single deep neural network (DNN) that utilizes VTL features with various warping factors. During training, a specific VTL feature is randomly selected per epoch, allowing the exploration of VTL variations.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03523v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03523v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03523v1-abstract-full" style="display: none;"> In this paper, we propose several methods that incorporate vocal tract length (VTL) warped features for spoken keyword spotting (KWS). The first method, VTL-independent KWS, involves training a single deep neural network (DNN) that utilizes VTL features with various warping factors. During training, a specific VTL feature is randomly selected per epoch, allowing the exploration of VTL variations. During testing, the VTL features with different warping factors of a test utterance are scored against the DNN and combined with equal weight. In the second method scores the conventional features of a test utterance (without VTL warping) against the DNN. The third method, VTL-concatenation KWS, concatenates VTL warped features to form high-dimensional features for KWS. Evaluations carried out on the English Google Command dataset demonstrate that the proposed methods improve the accuracy of KWS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03523v1-abstract-full').style.display = 'none'; document.getElementById('2501.03523v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03184">arXiv:2501.03184</a> <span> [<a href="https://arxiv.org/pdf/2501.03184">pdf</a>, <a href="https://arxiv.org/format/2501.03184">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Noise-Robust Target-Speaker Voice Activity Detection Through Self-Supervised Pretraining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bovbjerg%2C+H+S">Holger Severin Bovbjerg</a>, <a href="/search/cs?searchtype=author&query=%C3%98stergaard%2C+J">Jan 脴stergaard</a>, <a href="/search/cs?searchtype=author&query=Jensen%2C+J">Jesper Jensen</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zheng-Hua Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03184v1-abstract-short" style="display: inline;"> Target-Speaker Voice Activity Detection (TS-VAD) is the task of detecting the presence of speech from a known target-speaker in an audio frame. Recently, deep neural network-based models have shown good performance in this task. However, training these models requires extensive labelled data, which is costly and time-consuming to obtain, particularly if generalization to unseen environments is cru… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03184v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03184v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03184v1-abstract-full" style="display: none;"> Target-Speaker Voice Activity Detection (TS-VAD) is the task of detecting the presence of speech from a known target-speaker in an audio frame. Recently, deep neural network-based models have shown good performance in this task. However, training these models requires extensive labelled data, which is costly and time-consuming to obtain, particularly if generalization to unseen environments is crucial. To mitigate this, we propose a causal, Self-Supervised Learning (SSL) pretraining framework, called Denoising Autoregressive Predictive Coding (DN-APC), to enhance TS-VAD performance in noisy conditions. We also explore various speaker conditioning methods and evaluate their performance under different noisy conditions. Our experiments show that DN-APC improves performance in noisy conditions, with a general improvement of approx. 2% in both seen and unseen noise. Additionally, we find that FiLM conditioning provides the best overall performance. Representation analysis via tSNE plots reveals robust initial representations of speech and non-speech from pretraining. This underscores the effectiveness of SSL pretraining in improving the robustness and performance of TS-VAD models in noisy environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03184v1-abstract-full').style.display = 'none'; document.getElementById('2501.03184v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to IEEE/ACM Transactions on Audio, Speech, and Language Processing for possible publication. 12 pages, 4 figures, 5 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T10 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.6 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02765">arXiv:2501.02765</a> <span> [<a href="https://arxiv.org/pdf/2501.02765">pdf</a>, <a href="https://arxiv.org/format/2501.02765">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Visual Large Language Models for Generalized and Specialized Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yifan Li</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+Z">Zhixin Lai</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+W">Wentao Bao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Dao%2C+A">Anh Dao</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+K">Kewei Sui</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jiayi Shen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+D">Dong Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huan Liu</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+Y">Yu Kong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02765v1-abstract-short" style="display: inline;"> Visual-language models (VLM) have emerged as a powerful tool for learning a unified embedding space for vision and language. Inspired by large language models, which have demonstrated strong reasoning and multi-task capabilities, visual large language models (VLLMs) are gaining increasing attention for building general-purpose VLMs. Despite the significant progress made in VLLMs, the related liter… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02765v1-abstract-full').style.display = 'inline'; document.getElementById('2501.02765v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02765v1-abstract-full" style="display: none;"> Visual-language models (VLM) have emerged as a powerful tool for learning a unified embedding space for vision and language. Inspired by large language models, which have demonstrated strong reasoning and multi-task capabilities, visual large language models (VLLMs) are gaining increasing attention for building general-purpose VLMs. Despite the significant progress made in VLLMs, the related literature remains limited, particularly from a comprehensive application perspective, encompassing generalized and specialized applications across vision (image, video, depth), action, and language modalities. In this survey, we focus on the diverse applications of VLLMs, examining their using scenarios, identifying ethics consideration and challenges, and discussing future directions for their development. By synthesizing these contents, we aim to provide a comprehensive guide that will pave the way for future innovations and broader applications of VLLMs. The paper list repository is available: https://github.com/JackYFL/awesome-VLLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02765v1-abstract-full').style.display = 'none'; document.getElementById('2501.02765v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02376">arXiv:2501.02376</a> <span> [<a href="https://arxiv.org/pdf/2501.02376">pdf</a>, <a href="https://arxiv.org/format/2501.02376">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generalizable Origin Identification for Text-Guided Image-to-Image Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenhao Wang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yifan Sun</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zongxin Yang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhentao Tan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zhengdong Hu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yi Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02376v1-abstract-short" style="display: inline;"> Text-guided image-to-image diffusion models excel in translating images based on textual prompts, allowing for precise and creative visual modifications. However, such a powerful technique can be misused for spreading misinformation, infringing on copyrights, and evading content tracing. This motivates us to introduce the task of origin IDentification for text-guided Image-to-image Diffusion model… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02376v1-abstract-full').style.display = 'inline'; document.getElementById('2501.02376v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02376v1-abstract-full" style="display: none;"> Text-guided image-to-image diffusion models excel in translating images based on textual prompts, allowing for precise and creative visual modifications. However, such a powerful technique can be misused for spreading misinformation, infringing on copyrights, and evading content tracing. This motivates us to introduce the task of origin IDentification for text-guided Image-to-image Diffusion models (ID$^2$), aiming to retrieve the original image of a given translated query. A straightforward solution to ID$^2$ involves training a specialized deep embedding model to extract and compare features from both query and reference images. However, due to visual discrepancy across generations produced by different diffusion models, this similarity-based approach fails when training on images from one model and testing on those from another, limiting its effectiveness in real-world applications. To solve this challenge of the proposed ID$^2$ task, we contribute the first dataset and a theoretically guaranteed method, both emphasizing generalizability. The curated dataset, OriPID, contains abundant Origins and guided Prompts, which can be used to train and test potential IDentification models across various diffusion models. In the method section, we first prove the existence of a linear transformation that minimizes the distance between the pre-trained Variational Autoencoder (VAE) embeddings of generated samples and their origins. Subsequently, it is demonstrated that such a simple linear transformation can be generalized across different diffusion models. Experimental results show that the proposed method achieves satisfying generalization performance, significantly surpassing similarity-based methods ($+31.6\%$ mAP), even those with generalization designs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02376v1-abstract-full').style.display = 'none'; document.getElementById('2501.02376v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.21044">arXiv:2412.21044</a> <span> [<a href="https://arxiv.org/pdf/2412.21044">pdf</a>, <a href="https://arxiv.org/format/2412.21044">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> E2EDiff: Direct Mapping from Noise to Data for Enhanced Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhiyu Tan</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+W">WenXu Qian</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Hesen Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Mengping Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lei Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.21044v1-abstract-short" style="display: inline;"> Diffusion models have emerged as a powerful framework for generative modeling, achieving state-of-the-art performance across various tasks. However, they face several inherent limitations, including a training-sampling gap, information leakage in the progressive noising process, and the inability to incorporate advanced loss functions like perceptual and adversarial losses during training. To addr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.21044v1-abstract-full').style.display = 'inline'; document.getElementById('2412.21044v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.21044v1-abstract-full" style="display: none;"> Diffusion models have emerged as a powerful framework for generative modeling, achieving state-of-the-art performance across various tasks. However, they face several inherent limitations, including a training-sampling gap, information leakage in the progressive noising process, and the inability to incorporate advanced loss functions like perceptual and adversarial losses during training. To address these challenges, we propose an innovative end-to-end training framework that aligns the training and sampling processes by directly optimizing the final reconstruction output. Our method eliminates the training-sampling gap, mitigates information leakage by treating the training process as a direct mapping from pure noise to the target data distribution, and enables the integration of perceptual and adversarial losses into the objective. Extensive experiments on benchmarks such as COCO30K and HW30K demonstrate that our approach consistently outperforms traditional diffusion models, achieving superior results in terms of FID and CLIP score, even with reduced sampling steps. These findings highlight the potential of end-to-end training to advance diffusion-based generative models toward more robust and efficient solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.21044v1-abstract-full').style.display = 'none'; document.getElementById('2412.21044v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">technical report, to be further updated</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.20740">arXiv:2412.20740</a> <span> [<a href="https://arxiv.org/pdf/2412.20740">pdf</a>, <a href="https://arxiv.org/format/2412.20740">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Similar but Patched Code Considered Harmful -- The Impact of Similar but Patched Code on Recurring Vulnerability Detection and How to Remove Them </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zixuan Tan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jiayuan Zhou</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xing Hu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+S">Shengyi Pan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kui Liu</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+X">Xin Xia</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.20740v1-abstract-short" style="display: inline;"> Identifying recurring vulnerabilities is crucial for ensuring software security. Clone-based techniques, while widely used, often generate many false alarms due to the existence of similar but patched (SBP) code, which is similar to vulnerable code but is not vulnerable due to having been patched. Although the SBP code poses a great challenge to the effectiveness of existing approaches, it has not… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20740v1-abstract-full').style.display = 'inline'; document.getElementById('2412.20740v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.20740v1-abstract-full" style="display: none;"> Identifying recurring vulnerabilities is crucial for ensuring software security. Clone-based techniques, while widely used, often generate many false alarms due to the existence of similar but patched (SBP) code, which is similar to vulnerable code but is not vulnerable due to having been patched. Although the SBP code poses a great challenge to the effectiveness of existing approaches, it has not yet been well explored. In this paper, we propose a programming language agnostic framework, Fixed Vulnerability Filter (FVF), to identify and filter such SBP instances in vulnerability detection. Different from existing studies that leverage function signatures, our approach analyzes code change histories to precisely pinpoint SBPs and consequently reduce false alarms. Evaluation under practical scenarios confirms the effectiveness and precision of our approach. Remarkably, FVF identifies and filters 65.1% of false alarms from four vulnerability detection tools (i.e., ReDeBug, VUDDY, MVP, and an elementary hash-based approach) without yielding false positives. We further apply FVF to 1,081 real-world software projects and construct a real-world SBP dataset containing 6,827 SBP functions. Due to the SBP nature, the dataset can act as a strict benchmark to test the sensitivity of the vulnerability detection approach in distinguishing real vulnerabilities and SBPs. Using this dataset, we demonstrate the ineffectiveness of four state-of-the-art deep learning-based vulnerability detection approaches. Our dataset can help developers make a more realistic evaluation of vulnerability detection approaches and also paves the way for further exploration of real-world SBP scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.20740v1-abstract-full').style.display = 'none'; document.getElementById('2412.20740v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by 47th IEEE/ACM International Conference on Software Engineering (ICSE 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17404">arXiv:2412.17404</a> <span> [<a href="https://arxiv.org/pdf/2412.17404">pdf</a>, <a href="https://arxiv.org/format/2412.17404">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> BrainMAP: Learning Multiple Activation Pathways in Brain Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Song Wang</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+Z">Zhenyu Lei</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+J">Jiaqi Ding</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xinyu Zhao</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Y">Yushun Dong</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+G">Guorong Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianlong Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chen Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Aiying Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jundong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17404v2-abstract-short" style="display: inline;"> Functional Magnetic Resonance Image (fMRI) is commonly employed to study human brain activity, since it offers insight into the relationship between functional fluctuations and human behavior. To enhance analysis and comprehension of brain activity, Graph Neural Networks (GNNs) have been widely applied to the analysis of functional connectivities (FC) derived from fMRI data, due to their ability t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17404v2-abstract-full').style.display = 'inline'; document.getElementById('2412.17404v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17404v2-abstract-full" style="display: none;"> Functional Magnetic Resonance Image (fMRI) is commonly employed to study human brain activity, since it offers insight into the relationship between functional fluctuations and human behavior. To enhance analysis and comprehension of brain activity, Graph Neural Networks (GNNs) have been widely applied to the analysis of functional connectivities (FC) derived from fMRI data, due to their ability to capture the synergistic interactions among brain regions. However, in the human brain, performing complex tasks typically involves the activation of certain pathways, which could be represented as paths across graphs. As such, conventional GNNs struggle to learn from these pathways due to the long-range dependencies of multiple pathways. To address these challenges, we introduce a novel framework BrainMAP to learn Multiple Activation Pathways in Brain networks. BrainMAP leverages sequential models to identify long-range correlations among sequentialized brain regions and incorporates an aggregation module based on Mixture of Experts (MoE) to learn from multiple pathways. Our comprehensive experiments highlight BrainMAP's superior performance. Furthermore, our framework enables explanatory analyses of crucial brain regions involved in tasks. Our code is provided at https://github.com/LzyFischer/Graph-Mamba. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17404v2-abstract-full').style.display = 'none'; document.getElementById('2412.17404v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.16112">arXiv:2412.16112</a> <span> [<a href="https://arxiv.org/pdf/2412.16112">pdf</a>, <a href="https://arxiv.org/format/2412.16112">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> CLEAR: Conv-Like Linearization Revs Pre-Trained Diffusion Transformers Up </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+S">Songhua Liu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhenxiong Tan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinchao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.16112v1-abstract-short" style="display: inline;"> Diffusion Transformers (DiT) have become a leading architecture in image generation. However, the quadratic complexity of attention mechanisms, which are responsible for modeling token-wise relationships, results in significant latency when generating high-resolution images. To address this issue, we aim at a linear attention mechanism in this paper that reduces the complexity of pre-trained DiTs… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16112v1-abstract-full').style.display = 'inline'; document.getElementById('2412.16112v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.16112v1-abstract-full" style="display: none;"> Diffusion Transformers (DiT) have become a leading architecture in image generation. However, the quadratic complexity of attention mechanisms, which are responsible for modeling token-wise relationships, results in significant latency when generating high-resolution images. To address this issue, we aim at a linear attention mechanism in this paper that reduces the complexity of pre-trained DiTs to linear. We begin our exploration with a comprehensive summary of existing efficient attention mechanisms and identify four key factors crucial for successful linearization of pre-trained DiTs: locality, formulation consistency, high-rank attention maps, and feature integrity. Based on these insights, we introduce a convolution-like local attention strategy termed CLEAR, which limits feature interactions to a local window around each query token, and thus achieves linear complexity. Our experiments indicate that, by fine-tuning the attention layer on merely 10K self-generated samples for 10K iterations, we can effectively transfer knowledge from a pre-trained DiT to a student model with linear complexity, yielding results comparable to the teacher model. Simultaneously, it reduces attention computations by 99.5% and accelerates generation by 6.3 times for generating 8K-resolution images. Furthermore, we investigate favorable properties in the distilled attention layers, such as zero-shot generalization cross various models and plugins, and improved support for multi-GPU parallel inference. Models and codes are available here: https://github.com/Huage001/CLEAR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.16112v1-abstract-full').style.display = 'none'; document.getElementById('2412.16112v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.15529">arXiv:2412.15529</a> <span> [<a href="https://arxiv.org/pdf/2412.15529">pdf</a>, <a href="https://arxiv.org/format/2412.15529">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> XRAG: eXamining the Core -- Benchmarking Foundational Components in Advanced Retrieval-Augmented Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mao%2C+Q">Qianren Mao</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+Y">Yangyifei Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jinlong Zhang</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+H">Hanwen Hao</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Z">Zhilong Cao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiaolong Wang</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+X">Xiao Guan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zhenting Huang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+W">Weifeng Jiang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+S">Shuyu Guo</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Z">Zhentao Han</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qili Zhang</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+S">Siyuan Tao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yujie Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Junnan Liu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhixing Tan</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jie Sun</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xudong Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Richong Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jianxin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.15529v2-abstract-short" style="display: inline;"> Retrieval-augmented generation (RAG) synergizes the retrieval of pertinent data with the generative capabilities of Large Language Models (LLMs), ensuring that the generated output is not only contextually relevant but also accurate and current. We introduce XRAG, an open-source, modular codebase that facilitates exhaustive evaluation of the performance of foundational components of advanced RAG m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15529v2-abstract-full').style.display = 'inline'; document.getElementById('2412.15529v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.15529v2-abstract-full" style="display: none;"> Retrieval-augmented generation (RAG) synergizes the retrieval of pertinent data with the generative capabilities of Large Language Models (LLMs), ensuring that the generated output is not only contextually relevant but also accurate and current. We introduce XRAG, an open-source, modular codebase that facilitates exhaustive evaluation of the performance of foundational components of advanced RAG modules. These components are systematically categorized into four core phases: pre-retrieval, retrieval, post-retrieval, and generation. We systematically analyse them across reconfigured datasets, providing a comprehensive benchmark for their effectiveness. As the complexity of RAG systems continues to escalate, we underscore the critical need to identify potential failure points in RAG systems. We formulate a suite of experimental methodologies and diagnostic testing protocols to dissect the failure points inherent in RAG engineering. Subsequently, we proffer bespoke solutions aimed at bolstering the overall performance of these modules. Our work thoroughly evaluates the performance of advanced core components in RAG systems, providing insights into optimizations for prevalent failure points. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.15529v2-abstract-full').style.display = 'none'; document.getElementById('2412.15529v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14688">arXiv:2412.14688</a> <span> [<a href="https://arxiv.org/pdf/2412.14688">pdf</a>, <a href="https://arxiv.org/format/2412.14688">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Logic Induced High-Order Reasoning Network for Event-Event Relation Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+P">Peixin Huang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+X">Xiang Zhao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+M">Minghao Hu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+W">Weidong Xiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14688v1-abstract-short" style="display: inline;"> To understand a document with multiple events, event-event relation extraction (ERE) emerges as a crucial task, aiming to discern how natural events temporally or structurally associate with each other. To achieve this goal, our work addresses the problems of temporal event relation extraction (TRE) and subevent relation extraction (SRE). The latest methods for such problems have commonly built do… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14688v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14688v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14688v1-abstract-full" style="display: none;"> To understand a document with multiple events, event-event relation extraction (ERE) emerges as a crucial task, aiming to discern how natural events temporally or structurally associate with each other. To achieve this goal, our work addresses the problems of temporal event relation extraction (TRE) and subevent relation extraction (SRE). The latest methods for such problems have commonly built document-level event graphs for global reasoning across sentences. However, the edges between events are usually derived from external tools heuristically, which are not always reliable and may introduce noise. Moreover, they are not capable of preserving logical constraints among event relations, e.g., coreference constraint, symmetry constraint and conjunction constraint. These constraints guarantee coherence between different relation types,enabling the generation of a uniffed event evolution graph. In this work, we propose a novel method named LogicERE, which performs high-order event relation reasoning through modeling logic constraints. Speciffcally, different from conventional event graphs, we design a logic constraint induced graph (LCG) without any external tools. LCG involves event nodes where the interactions among them can model the coreference constraint, and event pairs nodes where the interactions among them can retain the symmetry constraint and conjunction constraint. Then we perform high-order reasoning on LCG with relational graph transformer to obtain enhanced event and event pair embeddings. Finally, we further incorporate logic constraint information via a joint logic learning module. Extensive experiments demonstrate the effectiveness of the proposed method with state-of-the-art performance on benchmark datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14688v1-abstract-full').style.display = 'none'; document.getElementById('2412.14688v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14191">arXiv:2412.14191</a> <span> [<a href="https://arxiv.org/pdf/2412.14191">pdf</a>, <a href="https://arxiv.org/format/2412.14191">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Ontology-Aware RAG for Improved Question-Answering in Cybersecurity Education </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chengshuai Zhao</a>, <a href="/search/cs?searchtype=author&query=Agrawal%2C+G">Garima Agrawal</a>, <a href="/search/cs?searchtype=author&query=Kumarage%2C+T">Tharindu Kumarage</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yuli Deng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying-Chih Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14191v1-abstract-short" style="display: inline;"> Integrating AI into education has the potential to transform the teaching of science and technology courses, particularly in the field of cybersecurity. AI-driven question-answering (QA) systems can actively manage uncertainty in cybersecurity problem-solving, offering interactive, inquiry-based learning experiences. Large language models (LLMs) have gained prominence in AI-driven QA systems, offe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14191v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14191v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14191v1-abstract-full" style="display: none;"> Integrating AI into education has the potential to transform the teaching of science and technology courses, particularly in the field of cybersecurity. AI-driven question-answering (QA) systems can actively manage uncertainty in cybersecurity problem-solving, offering interactive, inquiry-based learning experiences. Large language models (LLMs) have gained prominence in AI-driven QA systems, offering advanced language understanding and user engagement. However, they face challenges like hallucinations and limited domain-specific knowledge, which reduce their reliability in educational settings. To address these challenges, we propose CyberRAG, an ontology-aware retrieval-augmented generation (RAG) approach for developing a reliable and safe QA system in cybersecurity education. CyberRAG employs a two-step approach: first, it augments the domain-specific knowledge by retrieving validated cybersecurity documents from a knowledge base to enhance the relevance and accuracy of the response. Second, it mitigates hallucinations and misuse by integrating a knowledge graph ontology to validate the final answer. Experiments on publicly available cybersecurity datasets show that CyberRAG delivers accurate, reliable responses aligned with domain knowledge, demonstrating the potential of AI tools to enhance education. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14191v1-abstract-full').style.display = 'none'; document.getElementById('2412.14191v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10443">arXiv:2412.10443</a> <span> [<a href="https://arxiv.org/pdf/2412.10443">pdf</a>, <a href="https://arxiv.org/format/2412.10443">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> SweetTokenizer: Semantic-Aware Spatial-Temporal Tokenizer for Compact Visual Discretization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhentao Tan</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+B">Ben Xue</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+J">Jian Jia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junhao Wang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+W">Wencai Ye</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+S">Shaoyun Shi</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Mingjie Sun</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenjin Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Quan Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+P">Peng Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10443v2-abstract-short" style="display: inline;"> This paper presents the \textbf{S}emantic-a\textbf{W}ar\textbf{E} spatial-t\textbf{E}mporal \textbf{T}okenizer (SweetTokenizer), a compact yet effective discretization approach for vision data. Our goal is to boost tokenizers' compression ratio while maintaining reconstruction fidelity in the VQ-VAE paradigm. Firstly, to obtain compact latent representations, we decouple images or videos into spat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10443v2-abstract-full').style.display = 'inline'; document.getElementById('2412.10443v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10443v2-abstract-full" style="display: none;"> This paper presents the \textbf{S}emantic-a\textbf{W}ar\textbf{E} spatial-t\textbf{E}mporal \textbf{T}okenizer (SweetTokenizer), a compact yet effective discretization approach for vision data. Our goal is to boost tokenizers' compression ratio while maintaining reconstruction fidelity in the VQ-VAE paradigm. Firstly, to obtain compact latent representations, we decouple images or videos into spatial-temporal dimensions, translating visual information into learnable querying spatial and temporal tokens through a \textbf{C}ross-attention \textbf{Q}uery \textbf{A}uto\textbf{E}ncoder (CQAE). Secondly, to complement visual information during compression, we quantize these tokens via a specialized codebook derived from off-the-shelf LLM embeddings to leverage the rich semantics from language modality. Finally, to enhance training stability and convergence, we also introduce a curriculum learning strategy, which proves critical for effective discrete visual representation learning. SweetTokenizer achieves comparable video reconstruction fidelity with only \textbf{25\%} of the tokens used in previous state-of-the-art video tokenizers, and boost video generation results by \textbf{32.9\%} w.r.t gFVD. When using the same token number, we significantly improves video and image reconstruction results by \textbf{57.1\%} w.r.t rFVD on UCF-101 and \textbf{37.2\%} w.r.t rFID on ImageNet-1K. Additionally, the compressed tokens are imbued with semantic information, enabling few-shot recognition capabilities powered by LLMs in downstream applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10443v2-abstract-full').style.display = 'none'; document.getElementById('2412.10443v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09812">arXiv:2412.09812</a> <span> [<a href="https://arxiv.org/pdf/2412.09812">pdf</a>, <a href="https://arxiv.org/format/2412.09812">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> ScaleOT: Privacy-utility-scalable Offsite-tuning with Dynamic LayerReplace and Selective Rank Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+K">Kai Yao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhaorui Tan</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+T">Tiandi Ye</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lichun Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yuan Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenyan Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jianke Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09812v1-abstract-short" style="display: inline;"> Offsite-tuning is a privacy-preserving method for tuning large language models (LLMs) by sharing a lossy compressed emulator from the LLM owners with data owners for downstream task tuning. This approach protects the privacy of both the model and data owners. However, current offsite tuning methods often suffer from adaptation degradation, high computational costs, and limited protection strength… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09812v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09812v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09812v1-abstract-full" style="display: none;"> Offsite-tuning is a privacy-preserving method for tuning large language models (LLMs) by sharing a lossy compressed emulator from the LLM owners with data owners for downstream task tuning. This approach protects the privacy of both the model and data owners. However, current offsite tuning methods often suffer from adaptation degradation, high computational costs, and limited protection strength due to uniformly dropping LLM layers or relying on expensive knowledge distillation. To address these issues, we propose ScaleOT, a novel privacy-utility-scalable offsite-tuning framework that effectively balances privacy and utility. ScaleOT introduces a novel layerwise lossy compression algorithm that uses reinforcement learning to obtain the importance of each layer. It employs lightweight networks, termed harmonizers, to replace the raw LLM layers. By combining important original LLM layers and harmonizers in different ratios, ScaleOT generates emulators tailored for optimal performance with various model scales for enhanced privacy protection. Additionally, we present a rank reduction method to further compress the original LLM layers, significantly enhancing privacy with negligible impact on utility. Comprehensive experiments show that ScaleOT can achieve nearly lossless offsite tuning performance compared with full fine-tuning while obtaining better model privacy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09812v1-abstract-full').style.display = 'none'; document.getElementById('2412.09812v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted by AAAI2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.07019">arXiv:2412.07019</a> <span> [<a href="https://arxiv.org/pdf/2412.07019">pdf</a>, <a href="https://arxiv.org/format/2412.07019">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Assessing the Impact of Conspiracy Theories Using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bohan Jiang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dawei Li</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xinyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Rao%2C+A">Ashwin Rao</a>, <a href="/search/cs?searchtype=author&query=Lerman%2C+K">Kristina Lerman</a>, <a href="/search/cs?searchtype=author&query=Bernard%2C+H+R">H. Russell Bernard</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.07019v1-abstract-short" style="display: inline;"> Measuring the relative impact of CTs is important for prioritizing responses and allocating resources effectively, especially during crises. However, assessing the actual impact of CTs on the public poses unique challenges. It requires not only the collection of CT-specific knowledge but also diverse information from social, psychological, and cultural dimensions. Recent advancements in large lang… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07019v1-abstract-full').style.display = 'inline'; document.getElementById('2412.07019v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.07019v1-abstract-full" style="display: none;"> Measuring the relative impact of CTs is important for prioritizing responses and allocating resources effectively, especially during crises. However, assessing the actual impact of CTs on the public poses unique challenges. It requires not only the collection of CT-specific knowledge but also diverse information from social, psychological, and cultural dimensions. Recent advancements in large language models (LLMs) suggest their potential utility in this context, not only due to their extensive knowledge from large training corpora but also because they can be harnessed for complex reasoning. In this work, we develop datasets of popular CTs with human-annotated impacts. Borrowing insights from human impact assessment processes, we then design tailored strategies to leverage LLMs for performing human-like CT impact assessments. Through rigorous experiments, we textit{discover that an impact assessment mode using multi-step reasoning to analyze more CT-related evidence critically produces accurate results; and most LLMs demonstrate strong bias, such as assigning higher impacts to CTs presented earlier in the prompt, while generating less accurate impact assessments for emotionally charged and verbose CTs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.07019v1-abstract-full').style.display = 'none'; document.getElementById('2412.07019v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04814">arXiv:2412.04814</a> <span> [<a href="https://arxiv.org/pdf/2412.04814">pdf</a>, <a href="https://arxiv.org/format/2412.04814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LiFT: Leveraging Human Feedback for Text-to-Video Model Alignment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yibin Wang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhiyu Tan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Junyan Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaomeng Yang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+C">Cheng Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04814v2-abstract-short" style="display: inline;"> Recent advancements in text-to-video (T2V) generative models have shown impressive capabilities. However, these models are still inadequate in aligning synthesized videos with human preferences (e.g., accurately reflecting text descriptions), which is particularly difficult to address, as human preferences are inherently subjective and challenging to formalize as objective functions. Therefore, th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04814v2-abstract-full').style.display = 'inline'; document.getElementById('2412.04814v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04814v2-abstract-full" style="display: none;"> Recent advancements in text-to-video (T2V) generative models have shown impressive capabilities. However, these models are still inadequate in aligning synthesized videos with human preferences (e.g., accurately reflecting text descriptions), which is particularly difficult to address, as human preferences are inherently subjective and challenging to formalize as objective functions. Therefore, this paper proposes LiFT, a novel fine-tuning method leveraging human feedback for T2V model alignment. Specifically, we first construct a Human Rating Annotation dataset, LiFT-HRA, consisting of approximately 10k human annotations, each including a score and its corresponding rationale. Based on this, we train a reward model LiFT-Critic to learn reward function effectively, which serves as a proxy for human judgment, measuring the alignment between given videos and human expectations. Lastly, we leverage the learned reward function to align the T2V model by maximizing the reward-weighted likelihood. As a case study, we apply our pipeline to CogVideoX-2B, showing that the fine-tuned model outperforms the CogVideoX-5B across all 16 metrics, highlighting the potential of human feedback in improving the alignment and quality of synthesized videos. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04814v2-abstract-full').style.display = 'none'; document.getElementById('2412.04814v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://codegoat24.github.io/LiFT</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04448">arXiv:2412.04448</a> <span> [<a href="https://arxiv.org/pdf/2412.04448">pdf</a>, <a href="https://arxiv.org/format/2412.04448">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MEMO: Memory-Guided Diffusion for Expressive Talking Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+L">Longtao Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yifan Zhang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hanzhong Guo</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+J">Jiachun Pan</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhenxiong Tan</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jiahao Lu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+C">Chuanxin Tang</a>, <a href="/search/cs?searchtype=author&query=An%2C+B">Bo An</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+S">Shuicheng Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04448v1-abstract-short" style="display: inline;"> Recent advances in video diffusion models have unlocked new potential for realistic audio-driven talking video generation. However, achieving seamless audio-lip synchronization, maintaining long-term identity consistency, and producing natural, audio-aligned expressions in generated talking videos remain significant challenges. To address these challenges, we propose Memory-guided EMOtion-aware di… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04448v1-abstract-full').style.display = 'inline'; document.getElementById('2412.04448v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04448v1-abstract-full" style="display: none;"> Recent advances in video diffusion models have unlocked new potential for realistic audio-driven talking video generation. However, achieving seamless audio-lip synchronization, maintaining long-term identity consistency, and producing natural, audio-aligned expressions in generated talking videos remain significant challenges. To address these challenges, we propose Memory-guided EMOtion-aware diffusion (MEMO), an end-to-end audio-driven portrait animation approach to generate identity-consistent and expressive talking videos. Our approach is built around two key modules: (1) a memory-guided temporal module, which enhances long-term identity consistency and motion smoothness by developing memory states to store information from a longer past context to guide temporal modeling via linear attention; and (2) an emotion-aware audio module, which replaces traditional cross attention with multi-modal attention to enhance audio-video interaction, while detecting emotions from audio to refine facial expressions via emotion adaptive layer norm. Extensive quantitative and qualitative results demonstrate that MEMO generates more realistic talking videos across diverse image and audio types, outperforming state-of-the-art methods in overall quality, audio-lip synchronization, identity consistency, and expression-emotion alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04448v1-abstract-full').style.display = 'none'; document.getElementById('2412.04448v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://memoavatar.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18905">arXiv:2411.18905</a> <span> [<a href="https://arxiv.org/pdf/2411.18905">pdf</a>, <a href="https://arxiv.org/format/2411.18905">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> FedRGL: Robust Federated Graph Learning for Label Noise </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+D">De Li</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+H">Haodong Qian</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qiyu Li</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhou Tan</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+Z">Zemin Gan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinyan Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xianxian Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18905v1-abstract-short" style="display: inline;"> Federated Graph Learning (FGL) is a distributed machine learning paradigm based on graph neural networks, enabling secure and collaborative modeling of local graph data among clients. However, label noise can degrade the global model's generalization performance. Existing federated label noise learning methods, primarily focused on computer vision, often yield suboptimal results when applied to FG… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18905v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18905v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18905v1-abstract-full" style="display: none;"> Federated Graph Learning (FGL) is a distributed machine learning paradigm based on graph neural networks, enabling secure and collaborative modeling of local graph data among clients. However, label noise can degrade the global model's generalization performance. Existing federated label noise learning methods, primarily focused on computer vision, often yield suboptimal results when applied to FGL. To address this, we propose a robust federated graph learning method with label noise, termed FedRGL. FedRGL introduces dual-perspective consistency noise node filtering, leveraging both the global model and subgraph structure under class-aware dynamic thresholds. To enhance client-side training, we incorporate graph contrastive learning, which improves encoder robustness and assigns high-confidence pseudo-labels to noisy nodes. Additionally, we measure model quality via predictive entropy of unlabeled nodes, enabling adaptive robust aggregation of the global model. Comparative experiments on multiple real-world graph datasets show that FedRGL outperforms 12 baseline methods across various noise rates, types, and numbers of clients. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18905v1-abstract-full').style.display = 'none'; document.getElementById('2411.18905v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17073">arXiv:2411.17073</a> <span> [<a href="https://arxiv.org/pdf/2411.17073">pdf</a>, <a href="https://arxiv.org/format/2411.17073">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Path-RAG: Knowledge-Guided Key Region Retrieval for Open-ended Pathology Visual Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Naeem%2C+A">Awais Naeem</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianhao Li</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+H">Huang-Ru Liao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiawei Xu</a>, <a href="/search/cs?searchtype=author&query=Mathew%2C+A+M">Aby M. Mathew</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zehao Zhu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Jaiswal%2C+A+K">Ajay Kumar Jaiswal</a>, <a href="/search/cs?searchtype=author&query=Salibian%2C+R+A">Raffi A. Salibian</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Ziniu Hu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianlong Chen</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Y">Ying Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17073v1-abstract-short" style="display: inline;"> Accurate diagnosis and prognosis assisted by pathology images are essential for cancer treatment selection and planning. Despite the recent trend of adopting deep-learning approaches for analyzing complex pathology images, they fall short as they often overlook the domain-expert understanding of tissue structure and cell composition. In this work, we focus on a challenging Open-ended Pathology VQA… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17073v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17073v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17073v1-abstract-full" style="display: none;"> Accurate diagnosis and prognosis assisted by pathology images are essential for cancer treatment selection and planning. Despite the recent trend of adopting deep-learning approaches for analyzing complex pathology images, they fall short as they often overlook the domain-expert understanding of tissue structure and cell composition. In this work, we focus on a challenging Open-ended Pathology VQA (PathVQA-Open) task and propose a novel framework named Path-RAG, which leverages HistoCartography to retrieve relevant domain knowledge from pathology images and significantly improves performance on PathVQA-Open. Admitting the complexity of pathology image analysis, Path-RAG adopts a human-centered AI approach by retrieving domain knowledge using HistoCartography to select the relevant patches from pathology images. Our experiments suggest that domain guidance can significantly boost the accuracy of LLaVA-Med from 38% to 47%, with a notable gain of 28% for H&E-stained pathology images in the PathVQA-Open dataset. For longer-form question and answer pairs, our model consistently achieves significant improvements of 32.5% in ARCH-Open PubMed and 30.6% in ARCH-Open Books on H\&E images. Our code and dataset is available here (https://github.com/embedded-robotics/path-rag). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17073v1-abstract-full').style.display = 'none'; document.getElementById('2411.17073v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16594">arXiv:2411.16594</a> <span> [<a href="https://arxiv.org/pdf/2411.16594">pdf</a>, <a href="https://arxiv.org/format/2411.16594">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> From Generation to Judgment: Opportunities and Challenges of LLM-as-a-judge </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+D">Dawei Li</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bohan Jiang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Liangjie Huang</a>, <a href="/search/cs?searchtype=author&query=Beigi%2C+A">Alimohammad Beigi</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chengshuai Zhao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Bhattacharjee%2C+A">Amrita Bhattacharjee</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuxuan Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Canyu Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tianhao Wu</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+K">Kai Shu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+L">Lu Cheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huan Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16594v6-abstract-short" style="display: inline;"> Assessment and evaluation have long been critical challenges in artificial intelligence (AI) and natural language processing (NLP). However, traditional methods, whether matching-based or embedding-based, often fall short of judging subtle attributes and delivering satisfactory results. Recent advancements in Large Language Models (LLMs) inspire the "LLM-as-a-judge" paradigm, where LLMs are levera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16594v6-abstract-full').style.display = 'inline'; document.getElementById('2411.16594v6-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16594v6-abstract-full" style="display: none;"> Assessment and evaluation have long been critical challenges in artificial intelligence (AI) and natural language processing (NLP). However, traditional methods, whether matching-based or embedding-based, often fall short of judging subtle attributes and delivering satisfactory results. Recent advancements in Large Language Models (LLMs) inspire the "LLM-as-a-judge" paradigm, where LLMs are leveraged to perform scoring, ranking, or selection across various tasks and applications. This paper provides a comprehensive survey of LLM-based judgment and assessment, offering an in-depth overview to advance this emerging field. We begin by giving detailed definitions from both input and output perspectives. Then we introduce a comprehensive taxonomy to explore LLM-as-a-judge from three dimensions: what to judge, how to judge and where to judge. Finally, we compile benchmarks for evaluating LLM-as-a-judge and highlight key challenges and promising directions, aiming to provide valuable insights and inspire future research in this promising research area. Paper list and more resources about LLM-as-a-judge can be found at https://github.com/llm-as-a-judge/Awesome-LLM-as-a-judge and https://llm-as-a-judge.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16594v6-abstract-full').style.display = 'none'; document.getElementById('2411.16594v6-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">v6: add new citations; 36 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15354">arXiv:2411.15354</a> <span> [<a href="https://arxiv.org/pdf/2411.15354">pdf</a>, <a href="https://arxiv.org/format/2411.15354">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Unified Semantic Log Parsing and Causal Graph Construction for Attack Attribution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhuoran Tan</a>, <a href="/search/cs?searchtype=author&query=Anagnostopoulos%2C+C">Christos Anagnostopoulos</a>, <a href="/search/cs?searchtype=author&query=Parambath%2C+S+P">Shameem P. Parambath</a>, <a href="/search/cs?searchtype=author&query=Singer%2C+J">Jeremy Singer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15354v1-abstract-short" style="display: inline;"> Multi-source logs provide a comprehensive overview of ongoing system activities, allowing for in-depth analysis to detect potential threats. A practical approach for threat detection involves explicit extraction of entity triples (subject, action, object) towards building provenance graphs to facilitate the analysis of system behavior. However, current log parsing methods mainly focus on retrievin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15354v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15354v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15354v1-abstract-full" style="display: none;"> Multi-source logs provide a comprehensive overview of ongoing system activities, allowing for in-depth analysis to detect potential threats. A practical approach for threat detection involves explicit extraction of entity triples (subject, action, object) towards building provenance graphs to facilitate the analysis of system behavior. However, current log parsing methods mainly focus on retrieving parameters and events from raw logs while approaches based on entity extraction are limited to processing a single type of log. To address these gaps, we contribute with a novel unified framework, coined UTLParser. UTLParser adopts semantic analysis to construct causal graphs by merging multiple sub-graphs from individual log sources in labeled log dataset. It leverages domain knowledge in threat hunting such as Points of Interest. We further explore log generation delays and provide interfaces for optimized temporal graph querying. Our experiments showcase that UTLParser overcomes drawbacks of other log parsing methods. Furthermore, UTLParser precisely extracts explicit causal threat information while being compatible with enormous downstream tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15354v1-abstract-full').style.display = 'none'; document.getElementById('2411.15354v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15098">arXiv:2411.15098</a> <span> [<a href="https://arxiv.org/pdf/2411.15098">pdf</a>, <a href="https://arxiv.org/format/2411.15098">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> OminiControl: Minimal and Universal Control for Diffusion Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhenxiong Tan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Songhua Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xingyi Yang</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+Q">Qiaochu Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinchao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15098v4-abstract-short" style="display: inline;"> In this paper, we introduce OminiControl, a highly versatile and parameter-efficient framework that integrates image conditions into pre-trained Diffusion Transformer (DiT) models. At its core, OminiControl leverages a parameter reuse mechanism, enabling the DiT to encode image conditions using itself as a powerful backbone and process them with its flexible multi-modal attention processors. Unlik… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15098v4-abstract-full').style.display = 'inline'; document.getElementById('2411.15098v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15098v4-abstract-full" style="display: none;"> In this paper, we introduce OminiControl, a highly versatile and parameter-efficient framework that integrates image conditions into pre-trained Diffusion Transformer (DiT) models. At its core, OminiControl leverages a parameter reuse mechanism, enabling the DiT to encode image conditions using itself as a powerful backbone and process them with its flexible multi-modal attention processors. Unlike existing methods, which rely heavily on additional encoder modules with complex architectures, OminiControl (1) effectively and efficiently incorporates injected image conditions with only ~0.1% additional parameters, and (2) addresses a wide range of image conditioning tasks in a unified manner, including subject-driven generation and spatially-aligned conditions such as edges, depth, and more. Remarkably, these capabilities are achieved by training on images generated by the DiT itself, which is particularly beneficial for subject-driven generation. Extensive evaluations demonstrate that OminiControl outperforms existing UNet-based and DiT-adapted models in both subject-driven and spatially-aligned conditional generation. Additionally, we release our training dataset, Subjects200K, a diverse collection of over 200,000 identity-consistent images, along with an efficient data synthesis pipeline to advance research in subject-consistent generation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15098v4-abstract-full').style.display = 'none'; document.getElementById('2411.15098v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14829">arXiv:2411.14829</a> <span> [<a href="https://arxiv.org/pdf/2411.14829">pdf</a>, <a href="https://arxiv.org/format/2411.14829">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> OSPtrack: A Labeled Dataset Targeting Simulated Execution of Open-Source Software </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhuoran Tan</a>, <a href="/search/cs?searchtype=author&query=Anagnosstopoulos%2C+C">Christos Anagnosstopoulos</a>, <a href="/search/cs?searchtype=author&query=Singer%2C+J">Jeremy Singer</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14829v2-abstract-short" style="display: inline;"> Open-source software serves as a foundation for the internet and the cyber supply chain, but its exploitation is becoming increasingly prevalent. While advances in vulnerability detection for OSS have been significant, prior research has largely focused on static code analysis, often neglecting runtime indicators. To address this shortfall, we created a comprehensive dataset spanning five ecosyste… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14829v2-abstract-full').style.display = 'inline'; document.getElementById('2411.14829v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14829v2-abstract-full" style="display: none;"> Open-source software serves as a foundation for the internet and the cyber supply chain, but its exploitation is becoming increasingly prevalent. While advances in vulnerability detection for OSS have been significant, prior research has largely focused on static code analysis, often neglecting runtime indicators. To address this shortfall, we created a comprehensive dataset spanning five ecosystems, capturing features generated during the execution of packages and libraries in isolated environments. The dataset includes 9,461 package reports, of which 1,962 are identified as malicious, and encompasses both static and dynamic features such as files, sockets, commands, and DNS records. Each report is labeled with verified information and detailed sub-labels for attack types, facilitating the identification of malicious indicators when source code is unavailable. This dataset supports runtime detection, enhances detection model training, and enables efficient comparative analysis across ecosystems, contributing to the strengthening of supply chain security. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14829v2-abstract-full').style.display = 'none'; document.getElementById('2411.14829v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11667">arXiv:2411.11667</a> <span> [<a href="https://arxiv.org/pdf/2411.11667">pdf</a>, <a href="https://arxiv.org/format/2411.11667">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dissecting Representation Misalignment in Contrastive Learning via Influence Function </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+L">Lijie Hu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+C">Chenyang Ren</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+H">Huanyi Xie</a>, <a href="/search/cs?searchtype=author&query=Saadi%2C+K">Khouloud Saadi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shu Yang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jingfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+D">Di Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11667v2-abstract-short" style="display: inline;"> Contrastive learning, commonly applied in large-scale multimodal models, often relies on data from diverse and often unreliable sources, which can include misaligned or mislabeled text-image pairs. This frequently leads to robustness issues and hallucinations, ultimately causing performance degradation. Data valuation is an efficient way to detect and trace these misalignments. Nevertheless, exist… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11667v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11667v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11667v2-abstract-full" style="display: none;"> Contrastive learning, commonly applied in large-scale multimodal models, often relies on data from diverse and often unreliable sources, which can include misaligned or mislabeled text-image pairs. This frequently leads to robustness issues and hallucinations, ultimately causing performance degradation. Data valuation is an efficient way to detect and trace these misalignments. Nevertheless, existing methods are computationally expensive for large-scale models. Although computationally efficient, classical influence functions are inadequate for contrastive learning models, as they were initially designed for pointwise loss. Furthermore, contrastive learning involves minimizing the distance between positive sample modalities while maximizing the distance between negative sample modalities. This necessitates evaluating the influence of samples from both perspectives. To tackle these challenges, we introduce the Extended Influence Function for Contrastive Loss (ECIF), an influence function crafted for contrastive loss. ECIF considers both positive and negative samples and provides a closed-form approximation of contrastive learning models, eliminating the need for retraining. Building upon ECIF, we develop a series of algorithms for data evaluation, misalignment detection, and misprediction trace-back tasks. Experimental results demonstrate our ECIF advances the transparency and interpretability of CLIP-style embedding models by offering a more accurate assessment of data impact and model alignment compared to traditional baseline methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11667v2-abstract-full').style.display = 'none'; document.getElementById('2411.11667v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">33 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07574">arXiv:2411.07574</a> <span> [<a href="https://arxiv.org/pdf/2411.07574">pdf</a>, <a href="https://arxiv.org/format/2411.07574">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Disentangling Tabular Data Towards Better One-Class Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jianan Ye</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhaorui Tan</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yijie Hu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xi Yang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+G">Guangliang Cheng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kaizhu Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07574v2-abstract-short" style="display: inline;"> Tabular anomaly detection under the one-class classification setting poses a significant challenge, as it involves accurately conceptualizing "normal" derived exclusively from a single category to discern anomalies from normal data variations. Capturing the intrinsic correlation among attributes within normal samples presents one promising method for learning the concept. To do so, the most recent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07574v2-abstract-full').style.display = 'inline'; document.getElementById('2411.07574v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07574v2-abstract-full" style="display: none;"> Tabular anomaly detection under the one-class classification setting poses a significant challenge, as it involves accurately conceptualizing "normal" derived exclusively from a single category to discern anomalies from normal data variations. Capturing the intrinsic correlation among attributes within normal samples presents one promising method for learning the concept. To do so, the most recent effort relies on a learnable mask strategy with a reconstruction task. However, this wisdom may suffer from the risk of producing uniform masks, i.e., essentially nothing is masked, leading to less effective correlation learning. To address this issue, we presume that attributes related to others in normal samples can be divided into two non-overlapping and correlated subsets, defined as CorrSets, to capture the intrinsic correlation effectively. Accordingly, we introduce an innovative method that disentangles CorrSets from normal tabular data. To our knowledge, this is a pioneering effort to apply the concept of disentanglement for one-class anomaly detection on tabular data. Extensive experiments on 20 tabular datasets show that our method substantially outperforms the state-of-the-art methods and leads to an average performance improvement of 6.1% on AUC-PR and 2.1% on AUC-ROC. Codes are available at https://github.com/yjnanan/Disent-AD. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07574v2-abstract-full').style.display = 'none'; document.getElementById('2411.07574v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06363">arXiv:2411.06363</a> <span> [<a href="https://arxiv.org/pdf/2411.06363">pdf</a>, <a href="https://arxiv.org/format/2411.06363">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Layer-Wise Feature Metric of Semantic-Pixel Matching for Few-Shot Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+H">Hao Tang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Junhao Lu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+G">Guoheng Huang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Ming Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xuhang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+G">Guo Zhong</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhengguang Tan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zinuo Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06363v1-abstract-short" style="display: inline;"> In Few-Shot Learning (FSL), traditional metric-based approaches often rely on global metrics to compute similarity. However, in natural scenes, the spatial arrangement of key instances is often inconsistent across images. This spatial misalignment can result in mismatched semantic pixels, leading to inaccurate similarity measurements. To address this issue, we propose a novel method called the Lay… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06363v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06363v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06363v1-abstract-full" style="display: none;"> In Few-Shot Learning (FSL), traditional metric-based approaches often rely on global metrics to compute similarity. However, in natural scenes, the spatial arrangement of key instances is often inconsistent across images. This spatial misalignment can result in mismatched semantic pixels, leading to inaccurate similarity measurements. To address this issue, we propose a novel method called the Layer-Wise Features Metric of Semantic-Pixel Matching (LWFM-SPM) to make finer comparisons. Our method enhances model performance through two key modules: (1) the Layer-Wise Embedding (LWE) Module, which refines the cross-correlation of image pairs to generate well-focused feature maps for each layer; (2)the Semantic-Pixel Matching (SPM) Module, which aligns critical pixels based on semantic embeddings using an assignment algorithm. We conducted extensive experiments to evaluate our method on four widely used few-shot classification benchmarks: miniImageNet, tieredImageNet, CUB-200-2011, and CIFAR-FS. The results indicate that LWFM-SPM achieves competitive performance across these benchmarks. Our code will be publicly available on https://github.com/Halo2Tang/Code-for-LWFM-SPM. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06363v1-abstract-full').style.display = 'none'; document.getElementById('2411.06363v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06106">arXiv:2411.06106</a> <span> [<a href="https://arxiv.org/pdf/2411.06106">pdf</a>, <a href="https://arxiv.org/format/2411.06106">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Personalize to generalize: Towards a universal medical multi-modality generalization through personalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhaorui Tan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xi Yang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+T">Tan Pan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianyi Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+C">Chen Jiang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xin Guo</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiufeng Wang</a>, <a href="/search/cs?searchtype=author&query=Nguyen%2C+A">Anh Nguyen</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yuan Qi</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kaizhu Huang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yuan Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06106v2-abstract-short" style="display: inline;"> The differences among medical imaging modalities, driven by distinct underlying principles, pose significant challenges for generalization in multi-modal medical tasks. Beyond modality gaps, individual variations, such as differences in organ size and metabolic rate, further impede a model's ability to generalize effectively across both modalities and diverse populations. Despite the importance of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06106v2-abstract-full').style.display = 'inline'; document.getElementById('2411.06106v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06106v2-abstract-full" style="display: none;"> The differences among medical imaging modalities, driven by distinct underlying principles, pose significant challenges for generalization in multi-modal medical tasks. Beyond modality gaps, individual variations, such as differences in organ size and metabolic rate, further impede a model's ability to generalize effectively across both modalities and diverse populations. Despite the importance of personalization, existing approaches to multi-modal generalization often neglect individual differences, focusing solely on common anatomical features. This limitation may result in weakened generalization in various medical tasks. In this paper, we unveil that personalization is critical for multi-modal generalization. Specifically, we propose an approach to achieve personalized generalization through approximating the underlying personalized invariant representation ${X}_h$ across various modalities by leveraging individual-level constraints and a learnable biological prior. We validate the feasibility and benefits of learning a personalized ${X}_h$, showing that this representation is highly generalizable and transferable across various multi-modal medical tasks. Extensive experimental results consistently show that the additionally incorporated personalization significantly improves performance and generalization across diverse scenarios, confirming its effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06106v2-abstract-full').style.display = 'none'; document.getElementById('2411.06106v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03284">arXiv:2411.03284</a> <span> [<a href="https://arxiv.org/pdf/2411.03284">pdf</a>, <a href="https://arxiv.org/format/2411.03284">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> SMoA: Improving Multi-agent Large Language Models with Sparse Mixture-of-Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+D">Dawei Li</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+P">Peijia Qian</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yifan Li</a>, <a href="/search/cs?searchtype=author&query=Chaudhary%2C+K+S">Kumar Satvik Chaudhary</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+L">Lijie Hu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+J">Jiayi Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03284v1-abstract-short" style="display: inline;"> While multi-agent systems have been shown to significantly enhance the performance of Large Language Models (LLMs) across various tasks and applications, the dense interaction between scaling agents potentially hampers their efficiency and diversity. To address these challenges, we draw inspiration from the sparse mixture-of-agents (SMoE) and propose a sparse mixture-of-agents (SMoA) framework to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03284v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03284v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03284v1-abstract-full" style="display: none;"> While multi-agent systems have been shown to significantly enhance the performance of Large Language Models (LLMs) across various tasks and applications, the dense interaction between scaling agents potentially hampers their efficiency and diversity. To address these challenges, we draw inspiration from the sparse mixture-of-agents (SMoE) and propose a sparse mixture-of-agents (SMoA) framework to improve the efficiency and diversity of multi-agent LLMs. Unlike completely connected structures, SMoA introduces novel Response Selection and Early Stopping mechanisms to sparsify information flows among individual LLM agents, striking a balance between performance and efficiency. Additionally, inspired by the expert diversity principle in SMoE frameworks for workload balance between experts, we assign distinct role descriptions to each LLM agent, fostering diverse and divergent thinking. Extensive experiments on reasoning, alignment, and fairness benchmarks demonstrate that SMoA achieves performance comparable to traditional mixture-of-agents approaches but with significantly lower computational costs. Further analysis reveals that SMoA is more stable, has a greater capacity to scale, and offers considerable potential through hyper-parameter optimization. Code and data will be available at: https://github.com/David-Li0406/SMoA. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03284v1-abstract-full').style.display = 'none'; document.getElementById('2411.03284v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01172">arXiv:2411.01172</a> <span> [<a href="https://arxiv.org/pdf/2411.01172">pdf</a>, <a href="https://arxiv.org/format/2411.01172">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Covariance-based Space Regularization for Few-shot Class Incremental Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yijie Hu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+G">Guanyu Yang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhaorui Tan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiaowei Huang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kaizhu Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qiu-Feng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01172v1-abstract-short" style="display: inline;"> Few-shot Class Incremental Learning (FSCIL) presents a challenging yet realistic scenario, which requires the model to continually learn new classes with limited labeled data (i.e., incremental sessions) while retaining knowledge of previously learned base classes (i.e., base sessions). Due to the limited data in incremental sessions, models are prone to overfitting new classes and suffering catas… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01172v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01172v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01172v1-abstract-full" style="display: none;"> Few-shot Class Incremental Learning (FSCIL) presents a challenging yet realistic scenario, which requires the model to continually learn new classes with limited labeled data (i.e., incremental sessions) while retaining knowledge of previously learned base classes (i.e., base sessions). Due to the limited data in incremental sessions, models are prone to overfitting new classes and suffering catastrophic forgetting of base classes. To tackle these issues, recent advancements resort to prototype-based approaches to constrain the base class distribution and learn discriminative representations of new classes. Despite the progress, the limited data issue still induces ill-divided feature space, leading the model to confuse the new class with old classes or fail to facilitate good separation among new classes. In this paper, we aim to mitigate these issues by directly constraining the span of each class distribution from a covariance perspective. In detail, we propose a simple yet effective covariance constraint loss to force the model to learn each class distribution with the same covariance matrix. In addition, we propose a perturbation approach to perturb the few-shot training samples in the feature space, which encourages the samples to be away from the weighted distribution of other classes. Regarding perturbed samples as new class data, the classifier is forced to establish explicit boundaries between each new class and the existing ones. Our approach is easy to integrate into existing FSCIL approaches to boost performance. Experiments on three benchmarks validate the effectiveness of our approach, achieving a new state-of-the-art performance of FSCIL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01172v1-abstract-full').style.display = 'none'; document.getElementById('2411.01172v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">WACV2025,10 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00788">arXiv:2411.00788</a> <span> [<a href="https://arxiv.org/pdf/2411.00788">pdf</a>, <a href="https://arxiv.org/format/2411.00788">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> KeyInst: Keyword Instruction for Improving SQL Formulation in Text-to-SQL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiping Liu</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhao Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00788v1-abstract-short" style="display: inline;"> Text-to-SQL parsing involves the translation of natural language queries (NLQs) into their corresponding SQL commands. A principal challenge within this domain is the formulation of SQL queries that are not only syntactically correct but also semantically aligned with the natural language input. However, the intrinsic disparity between the NLQ and the SQL poses a significant challenge. In this res… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00788v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00788v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00788v1-abstract-full" style="display: none;"> Text-to-SQL parsing involves the translation of natural language queries (NLQs) into their corresponding SQL commands. A principal challenge within this domain is the formulation of SQL queries that are not only syntactically correct but also semantically aligned with the natural language input. However, the intrinsic disparity between the NLQ and the SQL poses a significant challenge. In this research, we introduce Keyword Instruction (KeyInst), a novel method designed to enhance SQL formulation by Large Language Models (LLMs). KeyInst essentially provides guidance on pivotal SQL keywords likely to be part of the final query, thus facilitates a smoother SQL query formulation process. We explore two strategies for integrating KeyInst into Text-to-SQL parsing: a pipeline strategy and a single-pass strategy. The former first generates KeyInst for question, which are then used to prompt LLMs. The latter employs a fine-tuned model to concurrently generate KeyInst and SQL in one step. We developed StrucQL, a benchmark specifically designed for the evaluation of SQL formulation. Extensive experiments on StrucQL and other benchmarks demonstrate that KeyInst significantly improves upon the existing Text-to-SQL prompting techniques. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00788v1-abstract-full').style.display = 'none'; document.getElementById('2411.00788v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23563">arXiv:2410.23563</a> <span> [<a href="https://arxiv.org/pdf/2410.23563">pdf</a>, <a href="https://arxiv.org/format/2410.23563">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Across-Platform Detection of Malicious Cryptocurrency Transactions via Account Interaction Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Che%2C+Z">Zheng Che</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+M">Meng Shen</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhehui Tan</a>, <a href="/search/cs?searchtype=author&query=Du%2C+H">Hanbiao Du</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Liehuang Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Ting Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Q">Qinglin Zhao</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yong Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23563v1-abstract-short" style="display: inline;"> With the rapid evolution of Web3.0, cryptocurrency has become a cornerstone of decentralized finance. While these digital assets enable efficient and borderless financial transactions, their pseudonymous nature has also attracted malicious activities such as money laundering, fraud, and other financial crimes. Effective detection of malicious transactions is crucial to maintaining the security and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23563v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23563v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23563v1-abstract-full" style="display: none;"> With the rapid evolution of Web3.0, cryptocurrency has become a cornerstone of decentralized finance. While these digital assets enable efficient and borderless financial transactions, their pseudonymous nature has also attracted malicious activities such as money laundering, fraud, and other financial crimes. Effective detection of malicious transactions is crucial to maintaining the security and integrity of the Web 3.0 ecosystem. Existing malicious transaction detection methods rely on large amounts of labeled data and suffer from low generalization. Label-efficient and generalizable malicious transaction detection remains a challenging task. In this paper, we propose ShadowEyes, a novel malicious transaction detection method. Specifically, we first propose a generalized graph structure named TxGraph as a representation of malicious transaction, which captures the interaction features of each malicious account and its neighbors. Then we carefully design a data augmentation method tailored to simulate the evolution of malicious transactions to generate positive pairs. To alleviate account label scarcity, we further design a graph contrastive mechanism, which enables ShadowEyes to learn discriminative features effectively from unlabeled data, thereby enhancing its detection capabilities in real-world scenarios. We conduct extensive experiments using public datasets to evaluate the performance of ShadowEyes. The results demonstrate that it outperforms state-of-the-art (SOTA) methods in four typical scenarios. Specifically, in the zero-shot learning scenario, it can achieve an F1 score of 76.98% for identifying gambling transactions, surpassing the SOTA method by12.05%. In the scenario of across-platform malicious transaction detection, ShadowEyes maintains an F1 score of around 90%, which is 10% higher than the SOTA method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23563v1-abstract-full').style.display = 'none'; document.getElementById('2410.23563v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22551">arXiv:2410.22551</a> <span> [<a href="https://arxiv.org/pdf/2410.22551">pdf</a>, <a href="https://arxiv.org/format/2410.22551">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FairSkin: Fair Diffusion for Skin Disease Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruichen Zhang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuguang Yao</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhen Tan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhiming Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Pan Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huan Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jingtong Hu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Sijia Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tianlong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22551v2-abstract-short" style="display: inline;"> Image generation is a prevailing technique for clinical data augmentation for advancing diagnostic accuracy and reducing healthcare disparities. Diffusion Model (DM) has become a leading method in generating synthetic medical images, but it suffers from a critical twofold bias: (1) The quality of images generated for Caucasian individuals is significantly higher, as measured by the Frechet Incepti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22551v2-abstract-full').style.display = 'inline'; document.getElementById('2410.22551v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22551v2-abstract-full" style="display: none;"> Image generation is a prevailing technique for clinical data augmentation for advancing diagnostic accuracy and reducing healthcare disparities. Diffusion Model (DM) has become a leading method in generating synthetic medical images, but it suffers from a critical twofold bias: (1) The quality of images generated for Caucasian individuals is significantly higher, as measured by the Frechet Inception Distance (FID). (2) The ability of the downstream-task learner to learn critical features from disease images varies across different skin tones. These biases pose significant risks, particularly in skin disease detection, where underrepresentation of certain skin tones can lead to misdiagnosis or neglect of specific conditions. To address these challenges, we propose FairSkin, a novel DM framework that mitigates these biases through a three-level resampling mechanism, ensuring fairer representation across racial and disease categories. Our approach significantly improves the diversity and quality of generated images, contributing to more equitable skin disease detection in clinical settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22551v2-abstract-full').style.display = 'none'; document.getElementById('2410.22551v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22108">arXiv:2410.22108</a> <span> [<a href="https://arxiv.org/pdf/2410.22108">pdf</a>, <a href="https://arxiv.org/format/2410.22108">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Protecting Privacy in Multimodal Large Language Models with MLLMU-Bench </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zheyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+G">Guangyao Dou</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+M">Mengzhao Jia</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhaoxuan Tan</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Q">Qingkai Zeng</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yongle Yuan</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+M">Meng Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22108v1-abstract-short" style="display: inline;"> Generative models such as Large Language Models (LLM) and Multimodal Large Language models (MLLMs) trained on massive web corpora can memorize and disclose individuals' confidential and private data, raising legal and ethical concerns. While many previous works have addressed this issue in LLM via machine unlearning, it remains largely unexplored for MLLMs. To tackle this challenge, we introduce M… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22108v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22108v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22108v1-abstract-full" style="display: none;"> Generative models such as Large Language Models (LLM) and Multimodal Large Language models (MLLMs) trained on massive web corpora can memorize and disclose individuals' confidential and private data, raising legal and ethical concerns. While many previous works have addressed this issue in LLM via machine unlearning, it remains largely unexplored for MLLMs. To tackle this challenge, we introduce Multimodal Large Language Model Unlearning Benchmark (MLLMU-Bench), a novel benchmark aimed at advancing the understanding of multimodal machine unlearning. MLLMU-Bench consists of 500 fictitious profiles and 153 profiles for public celebrities, each profile feature over 14 customized question-answer pairs, evaluated from both multimodal (image+text) and unimodal (text) perspectives. The benchmark is divided into four sets to assess unlearning algorithms in terms of efficacy, generalizability, and model utility. Finally, we provide baseline results using existing generative model unlearning algorithms. Surprisingly, our experiments show that unimodal unlearning algorithms excel in generation and cloze tasks, while multimodal unlearning approaches perform better in classification tasks with multimodal inputs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22108v1-abstract-full').style.display = 'none'; document.getElementById('2410.22108v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20105">arXiv:2410.20105</a> <span> [<a href="https://arxiv.org/pdf/2410.20105">pdf</a>, <a href="https://arxiv.org/format/2410.20105">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> FedSSP: Federated Graph Learning with Spectral Knowledge and Personalized Preference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zihan Tan</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+G">Guancheng Wan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wenke Huang</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+M">Mang Ye</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20105v1-abstract-short" style="display: inline;"> Personalized Federated Graph Learning (pFGL) facilitates the decentralized training of Graph Neural Networks (GNNs) without compromising privacy while accommodating personalized requirements for non-IID participants. In cross-domain scenarios, structural heterogeneity poses significant challenges for pFGL. Nevertheless, previous pFGL methods incorrectly share non-generic knowledge globally and fai… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20105v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20105v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20105v1-abstract-full" style="display: none;"> Personalized Federated Graph Learning (pFGL) facilitates the decentralized training of Graph Neural Networks (GNNs) without compromising privacy while accommodating personalized requirements for non-IID participants. In cross-domain scenarios, structural heterogeneity poses significant challenges for pFGL. Nevertheless, previous pFGL methods incorrectly share non-generic knowledge globally and fail to tailor personalized solutions locally under domain structural shift. We innovatively reveal that the spectral nature of graphs can well reflect inherent domain structural shifts. Correspondingly, our method overcomes it by sharing generic spectral knowledge. Moreover, we indicate the biased message-passing schemes for graph structures and propose the personalized preference module. Combining both strategies, we propose our pFGL framework FedSSP which Shares generic Spectral knowledge while satisfying graph Preferences. Furthermore, We perform extensive experiments on cross-dataset and cross-domain settings to demonstrate the superiority of our framework. The code is available at https://github.com/OakleyTan/FedSSP. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20105v1-abstract-full').style.display = 'none'; document.getElementById('2410.20105v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19246">arXiv:2410.19246</a> <span> [<a href="https://arxiv.org/pdf/2410.19246">pdf</a>, <a href="https://arxiv.org/format/2410.19246">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Discrete Mathematics">cs.DM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Combinatorics">math.CO</span> </div> </div> <p class="title is-5 mathjax"> Paths and Intersections: Characterization of Quasi-metrics in Directed Okamura-Seymour Instances </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yu Chen</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zihan Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19246v1-abstract-short" style="display: inline;"> We study the following distance realization problem. Given a quasi-metric $D$ on a set $T$ of terminals, does there exist a directed Okamura-Seymour graph that realizes $D$ as the (directed) shortest-path distance metric on $T$? We show that, if we are further given the circular ordering of terminals lying on the boundary, then Monge property is a sufficient and necessary condition. This generaliz… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19246v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19246v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19246v1-abstract-full" style="display: none;"> We study the following distance realization problem. Given a quasi-metric $D$ on a set $T$ of terminals, does there exist a directed Okamura-Seymour graph that realizes $D$ as the (directed) shortest-path distance metric on $T$? We show that, if we are further given the circular ordering of terminals lying on the boundary, then Monge property is a sufficient and necessary condition. This generalizes previous results for undirected Okamura-Seymour instances. With the circular ordering, we give a greedy algorithm for constructing a directed Okamura-Seymour instance that realizes the input quasi-metric. The algorithm takes the dual perspective concerning flows and routings, and is based on a new way of analyzing graph structures, by viewing graphs as \emph{paths and their intersections}. We believe this new understanding is of independent interest and will prove useful in other problems in graph theory and graph algorithms. We also design an efficient algorithm for finding such a circular ordering that makes $D$ satisfy Monge property, if one exists. Combined with our result above, this gives an efficient algorithm for the distance realization problem. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19246v1-abstract-full').style.display = 'none'; document.getElementById('2410.19246v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16983">arXiv:2410.16983</a> <span> [<a href="https://arxiv.org/pdf/2410.16983">pdf</a>, <a href="https://arxiv.org/format/2410.16983">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Order Matters: Exploring Order Sensitivity in Multimodal Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhijie Tan</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+X">Xu Chu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Weiping Li</a>, <a href="/search/cs?searchtype=author&query=Mo%2C+T">Tong Mo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16983v1-abstract-short" style="display: inline;"> Multimodal Large Language Models (MLLMs) utilize multimodal contexts consisting of text, images, or videos to solve various multimodal tasks. However, we find that changing the order of multimodal input can cause the model's performance to fluctuate between advanced performance and random guessing. This phenomenon exists in both single-modality (text-only or image-only) and mixed-modality (image-t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16983v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16983v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16983v1-abstract-full" style="display: none;"> Multimodal Large Language Models (MLLMs) utilize multimodal contexts consisting of text, images, or videos to solve various multimodal tasks. However, we find that changing the order of multimodal input can cause the model's performance to fluctuate between advanced performance and random guessing. This phenomenon exists in both single-modality (text-only or image-only) and mixed-modality (image-text-pair) contexts. Furthermore, we demonstrate that popular MLLMs pay special attention to certain multimodal context positions, particularly the beginning and end. Leveraging this special attention, we place key video frames and important image/text content in special positions within the context and submit them to the MLLM for inference. This method results in average performance gains of 14.7% for video-caption matching and 17.8% for visual question answering tasks. Additionally, we propose a new metric, Position-Invariant Accuracy (PIA), to address order bias in MLLM evaluation. Our research findings contribute to a better understanding of Multi-Modal In-Context Learning (MMICL) and provide practical strategies for enhancing MLLM performance without increasing computational costs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16983v1-abstract-full').style.display = 'none'; document.getElementById('2410.16983v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14141">arXiv:2410.14141</a> <span> [<a href="https://arxiv.org/pdf/2410.14141">pdf</a>, <a href="https://arxiv.org/format/2410.14141">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Coherence-Driven Multimodal Safety Dialogue with Active Learning for Embodied Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hassan%2C+S">Sabit Hassan</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+H">Hye-Young Chung</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+X+Z">Xiang Zhi Tan</a>, <a href="/search/cs?searchtype=author&query=Alikhani%2C+M">Malihe Alikhani</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14141v1-abstract-short" style="display: inline;"> When assisting people in daily tasks, robots need to accurately interpret visual cues and respond effectively in diverse safety-critical situations, such as sharp objects on the floor. In this context, we present M-CoDAL, a multimodal-dialogue system specifically designed for embodied agents to better understand and communicate in safety-critical situations. The system leverages discourse coherenc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14141v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14141v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14141v1-abstract-full" style="display: none;"> When assisting people in daily tasks, robots need to accurately interpret visual cues and respond effectively in diverse safety-critical situations, such as sharp objects on the floor. In this context, we present M-CoDAL, a multimodal-dialogue system specifically designed for embodied agents to better understand and communicate in safety-critical situations. The system leverages discourse coherence relations to enhance its contextual understanding and communication abilities. To train this system, we introduce a novel clustering-based active learning mechanism that utilizes an external Large Language Model (LLM) to identify informative instances. Our approach is evaluated using a newly created multimodal dataset comprising 1K safety violations extracted from 2K Reddit images. These violations are annotated using a Large Multimodal Model (LMM) and verified by human annotators. Results with this dataset demonstrate that our approach improves resolution of safety situations, user sentiment, as well as safety of the conversation. Next, we deploy our dialogue system on a Hello Robot Stretch robot and conduct a within-subject user study with real-world participants. In the study, participants role-play two safety scenarios with different levels of severity with the robot and receive interventions from our model and a baseline system powered by OpenAI's ChatGPT. The study results corroborate and extend the findings from automated evaluation, showing that our proposed system is more persuasive and competent in a real-world embodied agent setting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14141v1-abstract-full').style.display = 'none'; document.getElementById('2410.14141v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12934">arXiv:2410.12934</a> <span> [<a href="https://arxiv.org/pdf/2410.12934">pdf</a>, <a href="https://arxiv.org/format/2410.12934">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Mathematical Reasoning in LLMs by Stepwise Correction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zhenyu Wu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Q">Qingkai Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhihan Zhang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Z">Zhaoxuan Tan</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+C">Chao Shen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+M">Meng Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.12934v1-abstract-short" style="display: inline;"> Best-of-N decoding methods instruct large language models (LLMs) to generate multiple solutions, score each using a scoring function, and select the highest scored as the final answer to mathematical reasoning problems. However, this repeated independent process often leads to the same mistakes, making the selected solution still incorrect. We propose a novel prompting method named Stepwise Correc… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12934v1-abstract-full').style.display = 'inline'; document.getElementById('2410.12934v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.12934v1-abstract-full" style="display: none;"> Best-of-N decoding methods instruct large language models (LLMs) to generate multiple solutions, score each using a scoring function, and select the highest scored as the final answer to mathematical reasoning problems. However, this repeated independent process often leads to the same mistakes, making the selected solution still incorrect. We propose a novel prompting method named Stepwise Correction (StepCo) that helps LLMs identify and revise incorrect steps in their generated reasoning paths. It iterates verification and revision phases that employ a process-supervised verifier. The verify-then-revise process not only improves answer correctness but also reduces token consumption with fewer paths needed to generate. With StepCo, a series of LLMs demonstrate exceptional performance. Notably, using GPT-4o as the backend LLM, StepCo achieves an average accuracy of 94.1 across eight datasets, significantly outperforming the state-of-the-art Best-of-N method by +2.4, while reducing token consumption by 77.8%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.12934v1-abstract-full').style.display = 'none'; document.getElementById('2410.12934v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">under review</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Tan%2C+Z&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Tan%2C+Z&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+Z&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+Z&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+Z&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Tan%2C+Z&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository