Search | arXiv e-print repository
We gratefully acknowledge support from the Simons Foundation, member institutions, and all contributors. Donate: https://info.arxiv.org/about/donate.html

Showing 1–50 of 1,776 results for author: Lin, Y
Searching in archive cs. Search in all archives: https://arxiv.org/search/?searchtype=author&query=Lin%2C+Y
Search v0.5.6 released 2020-02-24 (https://github.com/arXiv/arxiv-search/releases)
type="text" value="Lin, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Lin%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Lin, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Lin%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Lin%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Lin%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.14866">arXiv:2502.14866</a> <span> [<a href="https://arxiv.org/pdf/2502.14866">pdf</a>, <a href="https://arxiv.org/format/2502.14866">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> LServe: Efficient Long-sequence LLM Serving with Unified Sparse Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shang Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Junxian Guo</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Haotian Tang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qinghao Hu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+G">Guangxuan Xiao</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jiaming Tang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yujun Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhijian Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yao Lu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Song Han</a> </p> <p class="abstract mathjax"> <span 
class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.14866v1-abstract-short" style="display: inline;"> Large language models (LLMs) have shown remarkable potential in processing long sequences, yet efficiently serving these long-context models remains challenging due to the quadratic computational complexity of attention in the prefilling stage and the large memory footprint of the KV cache in the decoding stage. To address these issues, we introduce LServe, an efficient system that accelerates lon… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14866v1-abstract-full').style.display = 'inline'; document.getElementById('2502.14866v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.14866v1-abstract-full" style="display: none;"> Large language models (LLMs) have shown remarkable potential in processing long sequences, yet efficiently serving these long-context models remains challenging due to the quadratic computational complexity of attention in the prefilling stage and the large memory footprint of the KV cache in the decoding stage. To address these issues, we introduce LServe, an efficient system that accelerates long-sequence LLM serving via hybrid sparse attention. This method unifies different hardware-friendly, structured sparsity patterns for both prefilling and decoding attention into a single framework, where computations on less important tokens are skipped block-wise. LServe demonstrates the compatibility of static and dynamic sparsity in long-context LLM attention. This design enables multiplicative speedups by combining these optimizations. Specifically, we convert half of the attention heads to nearly free streaming heads in both the prefilling and decoding stages. Additionally, we find that only a constant number of KV pages is required to preserve long-context capabilities, irrespective of context length. We then design a hierarchical KV page selection policy that dynamically prunes KV pages based on query-centric similarity. On average, LServe accelerates LLM prefilling by up to 2.9x and decoding by 1.3-2.1x over vLLM, maintaining long-context accuracy. Code is released at https://github.com/mit-han-lab/omniserve. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.14866v1-abstract-full').style.display = 'none'; document.getElementById('2502.14866v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by MLSys 2025. 
2. arXiv:2502.14847 [pdf, other] (https://arxiv.org/abs/2502.14847)
Subjects: cs.CR
Title: Red-Teaming LLM Multi-Agent Systems via Communication Attacks
Authors: Pengfei He, Yupin Lin, Shen Dong, Han Xu, Yue Xing, Hui Liu
Abstract: Large Language Model-based Multi-Agent Systems (LLM-MAS) have revolutionized complex problem-solving by enabling sophisticated agent collaboration through message-based communication. While the communication framework is crucial for agent coordination, it also introduces a critical yet unexplored security vulnerability. In this work, we introduce Agent-in-the-Middle (AiTM), a novel attack that exploits the fundamental communication mechanisms in LLM-MAS by intercepting and manipulating inter-agent messages. Unlike existing attacks that compromise individual agents, AiTM demonstrates how an adversary can compromise an entire multi-agent system by manipulating only the messages passing between agents. To enable the attack under the challenges of limited control and a role-restricted communication format, we develop an LLM-powered adversarial agent with a reflection mechanism that generates contextually aware malicious instructions. Our comprehensive evaluation across various frameworks, communication structures, and real-world applications demonstrates that LLM-MAS is vulnerable to communication-based attacks, highlighting the need for robust security measures in multi-agent systems.
Submitted 20 February, 2025; originally announced February 2025.
3. arXiv:2502.14328 [pdf, other] (https://arxiv.org/abs/2502.14328)
Subjects: cs.SE
Title: SolSearch: An LLM-Driven Framework for Efficient SAT-Solving Code Generation
Authors: Junjie Sheng, Yanqiu Lin, Jiehao Wu, Yanhong Huang, Jianqi Shi, Min Zhang, Xiangfeng Wang
Abstract: The Satisfiability (SAT) problem is a core challenge with significant applications in software engineering, including automated testing, configuration management, and program verification. This paper presents SolSearch, a novel framework that harnesses large language models (LLMs) to discover and optimize SAT-solving strategies automatically. Leveraging a curriculum-based, trial-and-error process, SolSearch enables the LLM to iteratively modify and generate SAT solver code, thereby improving solving efficiency and performance. This automated SAT-solving paradigm is plug-and-play, allowing integration with any SAT solver and accelerating the development of new solvers. Our preliminary experimental results are encouraging, demonstrating that the LLM-powered paradigm improves state-of-the-art SAT solvers on general SAT benchmarks and significantly enhances the performance of the widely used Z3 solver (11% on the PAR-2 score). These results highlight the potential of LLM-driven methods to advance solver adaptability and effectiveness in real-world software engineering challenges. Future research directions are discussed to further refine and validate this approach, offering a promising avenue for integrating AI with traditional software engineering tasks.
Submitted 20 February, 2025; originally announced February 2025.
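The curriculum-based, trial-and-error loop this abstract outlines amounts to: propose a code change, benchmark it, keep improvements. A minimal sketch under those assumptions, where propose_patch stands in for an LLM call and the PAR-2 benchmark is simulated:

    import random

    def propose_patch(solver_src: str, feedback: str) -> str:
        """Placeholder for an LLM that rewrites solver code given feedback."""
        return solver_src + f"\n# tweak motivated by: {feedback}"

    def par2_score(solver_src: str) -> float:
        """Placeholder benchmark: PAR-2 charges twice the timeout for
        unsolved instances; lower is better. Simulated noisy measurement."""
        return random.uniform(50, 100) - solver_src.count("# tweak")

    best_src, best_score = "def solve(cnf): ...", float("inf")
    for round_ in range(5):                    # curriculum rounds
        candidate = propose_patch(best_src, f"round {round_}: reduce PAR-2")
        score = par2_score(candidate)
        if score < best_score:                 # keep only improvements
            best_src, best_score = candidate, score
    print(f"best simulated PAR-2: {best_score:.1f}")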
4. arXiv:2502.14080 [pdf, other] (https://arxiv.org/abs/2502.14080)
Subjects: cs.CY; cs.AI
Title: Personalized Education with Generative AI and Digital Twins: VR, RAG, and Zero-Shot Sentiment Analysis for Industry 4.0 Workforce Development
Authors: Yu-Zheng Lin, Karan Petal, Ahmed H Alhamadah, Sujan Ghimire, Matthew William Redondo, David Rafael Vidal Corona, Jesus Pacheco, Soheil Salehi, Pratik Satam
Abstract: The Fourth Industrial Revolution (4IR) technologies, such as cloud computing, machine learning, and AI, have improved productivity but introduced challenges in workforce training and reskilling. This is critical given existing workforce shortages, especially in marginalized communities like Underrepresented Minorities (URM), who often lack access to quality education. Addressing these challenges, this research presents gAI-PT4I4, a Generative AI-based Personalized Tutor for Industry 4.0, designed to personalize 4IR experiential learning. gAI-PT4I4 employs sentiment analysis to assess student comprehension, leveraging generative AI and a finite automaton to tailor learning experiences. The framework integrates low-fidelity Digital Twins for VR-based training, featuring an Interactive Tutor, a generative AI assistant providing real-time guidance via audio and text. It uses zero-shot sentiment analysis with LLMs and prompt engineering, achieving 86% accuracy in classifying student-teacher interactions as positive or negative. Additionally, retrieval-augmented generation (RAG) enables personalized learning content grounded in domain-specific knowledge. To adapt training dynamically, the finite automaton structures exercises into states of increasing difficulty, requiring 80% task-performance accuracy for progression. Experimental evaluation with 22 volunteers showed accuracy exceeding 80% while reducing training time. Finally, this paper introduces a Multi-Fidelity Digital Twin model, aligning Digital Twin complexity with Bloom's Taxonomy and Kirkpatrick's model, providing a scalable educational framework.
Submitted 19 February, 2025; originally announced February 2025.
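The finite-automaton progression with an 80% mastery threshold can be shown directly. A small sketch, with invented level names; only the threshold comes from the abstract:

    # Difficulty-progression automaton: states are exercise difficulty
    # levels, advancing only when task accuracy reaches the 80% threshold.
    LEVELS = ["intro", "guided", "independent", "expert"]
    THRESHOLD = 0.80

    def next_state(level: str, accuracy: float) -> str:
        """Advance on mastery, repeat the level otherwise."""
        i = LEVELS.index(level)
        if accuracy >= THRESHOLD and i + 1 < len(LEVELS):
            return LEVELS[i + 1]
        return level

    state = "intro"
    for acc in [0.62, 0.85, 0.91, 0.74, 0.83]:   # simulated session accuracies
        state = next_state(state, acc)
        print(f"accuracy={acc:.2f} -> state={state}")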
5. arXiv:2502.13016 [pdf, other] (https://arxiv.org/abs/2502.13016)
Subjects: cs.DB; cs.AI
Title: LLM-Powered Proactive Data Systems
Authors: Sepanta Zeighami, Yiming Lin, Shreya Shankar, Aditya Parameswaran
Abstract: With the power of LLMs, we now have the ability to query data that was previously impossible to query, including text, images, and video. However, despite this enormous potential, most present-day data systems that leverage LLMs are reactive, reflecting our community's desire to map LLMs to known abstractions. Most data systems treat LLMs as an opaque black box that operates on user inputs and data as is, optimizing them much like any other approximate, expensive UDF in conjunction with other relational operators. Such data systems do as they are told, but fail to understand and leverage what the LLM is being asked to do (i.e., the underlying operations, which may be error-prone), the data the LLM is operating on (e.g., long, complex documents), or what the user really needs. They don't take advantage of the characteristics of the operations and/or the data at hand, or ensure correctness of results when there are imprecisions and ambiguities. We argue that data systems instead need to be proactive: they need to be given more agency, armed with the power of LLMs, to understand and rework the user inputs and the data and to make decisions on how the operations and the data should be represented and processed. By allowing the data system to parse, rewrite, and decompose user inputs and data, or to interact with the user in ways that go beyond the standard single-shot query-result paradigm, the data system is able to address user needs more efficiently and effectively. These new capabilities lead to a rich design space where the data system takes more initiative: it is empowered to perform optimization based on the transformation operations, data characteristics, and user intent. We discuss various successful examples of how this framework has been and can be applied in real-world tasks, and present future directions for this ambitious research agenda.
Submitted 18 February, 2025; originally announced February 2025.
Journal ref: IEEE Data Engineering Bulletin, March 2025

6. arXiv:2502.12820 [pdf, other] (https://arxiv.org/abs/2502.12820)
Subjects: cs.DC
Title: Atomic Smart Contract Interoperability with High Efficiency via Cross-Chain Integrated Execution
Authors: Chaoyue Yin, Mingzhe Li, Jin Zhang, You Lin, Qingsong Wei, Siow Mong Rick Goh
Abstract: With the development of Ethereum, numerous blockchains compatible with Ethereum's execution environment (i.e., the Ethereum Virtual Machine, EVM) have emerged. Developers can leverage smart contracts to run various complex decentralized applications on top of blockchains. However, the increasing number of EVM-compatible blockchains has introduced significant challenges in cross-chain interoperability, particularly in ensuring efficiency and atomicity for the whole cross-chain application. Existing solutions either are limited in guaranteeing overall atomicity for the cross-chain application or are inefficient due to the need for multiple rounds of cross-chain smart contract execution. To address this gap, we propose IntegrateX, an efficient cross-chain interoperability system that ensures the overall atomicity of cross-chain smart contract invocations. The core idea is to deploy the logic required for cross-chain execution onto a single blockchain, where it can be executed in an integrated manner. This allows cross-chain applications to perform all cross-chain logic efficiently within the same blockchain. IntegrateX consists of a cross-chain smart contract deployment protocol and a cross-chain smart contract integrated execution protocol. The former achieves efficient and secure cross-chain deployment by decoupling smart contract logic from state and employing an off-chain cross-chain deployment mechanism combined with on-chain cross-chain verification. The latter ensures atomicity of cross-chain invocations through a 2PC-based mechanism and enhances performance through transaction aggregation and fine-grained state locks. We implement a prototype of IntegrateX. Extensive experiments demonstrate that it reduces latency by up to 61.2% compared to the state-of-the-art baseline while maintaining low gas consumption.
Submitted 18 February, 2025; originally announced February 2025.
Comments: In submission to IEEE Transactions on Parallel and Distributed Systems
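The 2PC-based atomicity mechanism mentioned in the abstract follows the classic two-phase-commit shape. A minimal sketch, where class and method names are illustrative rather than IntegrateX's interfaces: a coordinator first asks every chain to lock the touched state, and commits only if all locks succeed.

    class Chain:
        def __init__(self, name):
            self.name, self.locks, self.state = name, set(), {}

        def prepare(self, key):            # phase 1: acquire fine-grained lock
            if key in self.locks:
                return False
            self.locks.add(key)
            return True

        def commit(self, key, value):      # phase 2a: apply and release
            self.state[key] = value
            self.locks.discard(key)

        def abort(self, key):              # phase 2b: release without applying
            self.locks.discard(key)

    def atomic_cross_chain_write(chains, key, value):
        prepared = [c for c in chains if c.prepare(key)]
        if len(prepared) == len(chains):   # all prepared -> commit everywhere
            for c in chains:
                c.commit(key, value)
            return True
        for c in prepared:                 # any failure -> roll back everywhere
            c.abort(key)
        return False

    a, b = Chain("chainA"), Chain("chainB")
    print(atomic_cross_chain_write([a, b], "escrow", 42))  # True: both commit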
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In submission to IEEE Transactions on Parallel and Distributed Systems</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12640">arXiv:2502.12640</a> <span> [<a href="https://arxiv.org/pdf/2502.12640">pdf</a>, <a href="https://arxiv.org/format/2502.12640">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RecDreamer: Consistent Text-to-3D Generation via Uniform Score Distillation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Chenxi Zheng</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yihong Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bangzhen Liu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xuemiao Xu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+Y">Yongwei Nie</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shengfeng He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12640v1-abstract-short" style="display: inline;"> Current text-to-3D generation methods based on score distillation often suffer from geometric inconsistencies, leading to repeated patterns across different poses of 3D assets. This issue, known as the Multi-Face Janus problem, arises because existing methods struggle to maintain consistency across varying poses and are biased toward a canonical pose. While recent work has improved pose control an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12640v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12640v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12640v1-abstract-full" style="display: none;"> Current text-to-3D generation methods based on score distillation often suffer from geometric inconsistencies, leading to repeated patterns across different poses of 3D assets. This issue, known as the Multi-Face Janus problem, arises because existing methods struggle to maintain consistency across varying poses and are biased toward a canonical pose. While recent work has improved pose control and approximation, these efforts are still limited by this inherent bias, which skews the guidance during generation. To address this, we propose a solution called RecDreamer, which reshapes the underlying data distribution to achieve a more consistent pose representation. The core idea behind our method is to rectify the prior distribution, ensuring that pose variation is uniformly distributed rather than biased toward a canonical form. By modifying the prescribed distribution through an auxiliary function, we can reconstruct the density of the distribution to ensure compliance with specific marginal constraints. In particular, we ensure that the marginal distribution of poses follows a uniform distribution, thereby eliminating the biases introduced by the prior knowledge. 
We incorporate this rectified data distribution into existing score distillation algorithms, a process we refer to as uniform score distillation. To efficiently compute the posterior distribution required for the auxiliary function, RecDreamer introduces a training-free classifier that estimates pose categories in a plug-and-play manner. Additionally, we utilize various approximation techniques for noisy states, significantly improving system performance. Our experimental results demonstrate that RecDreamer effectively mitigates the Multi-Face Janus problem, leading to more consistent 3D asset generation across different poses. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12640v1-abstract-full').style.display = 'none'; document.getElementById('2502.12640v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.12354">arXiv:2502.12354</a> <span> [<a href="https://arxiv.org/pdf/2502.12354">pdf</a>, <a href="https://arxiv.org/format/2502.12354">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Human-centered explanation does not fit all: The interplay of sociotechnical, cognitive, and individual factors in the effect AI explanations in algorithmic decision-making </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ahn%2C+Y">Yongsu Ahn</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yu-Run Lin</a>, <a href="/search/cs?searchtype=author&query=Alikhani%2C+M">Malihe Alikhani</a>, <a href="/search/cs?searchtype=author&query=Cheon%2C+E">Eunjeong Cheon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.12354v1-abstract-short" style="display: inline;"> Recent XAI studies have investigated what constitutes a \textit{good} explanation in AI-assisted decision-making. Despite the widely accepted human-friendly properties of explanations, such as contrastive and selective, existing studies have yielded inconsistent findings. To address these gaps, our study focuses on the cognitive dimensions of explanation evaluation, by evaluating six explanations… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.12354v1-abstract-full').style.display = 'inline'; document.getElementById('2502.12354v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.12354v1-abstract-full" style="display: none;"> Recent XAI studies have investigated what constitutes a \textit{good} explanation in AI-assisted decision-making. 
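One way to picture the rectification behind uniform score distillation: if a training-free classifier estimates the pose marginal, weighting each pose bin by the inverse of that marginal makes the rectified pose marginal uniform. A toy numeric sketch with made-up numbers, not the paper's actual computation:

    import numpy as np

    # Biased prior: the classifier says most samples face the canonical view.
    pose_marginal = np.array([0.70, 0.10, 0.10, 0.10])

    weights = 1.0 / pose_marginal       # inverse-marginal importance weights
    weights /= weights.sum()

    rectified = pose_marginal * weights
    rectified /= rectified.sum()
    print(rectified)                    # -> [0.25 0.25 0.25 0.25], uniform over poses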
8. arXiv:2502.12354 [pdf, other] (https://arxiv.org/abs/2502.12354)
Subjects: cs.CY; cs.AI; cs.HC
Title: Human-centered explanation does not fit all: The interplay of sociotechnical, cognitive, and individual factors in the effect of AI explanations in algorithmic decision-making
Authors: Yongsu Ahn, Yu-Run Lin, Malihe Alikhani, Eunjeong Cheon
Abstract: Recent XAI studies have investigated what constitutes a "good" explanation in AI-assisted decision-making. Despite the widely accepted human-friendly properties of explanations, such as being contrastive and selective, existing studies have yielded inconsistent findings. To address these gaps, our study focuses on the cognitive dimensions of explanation evaluation, by evaluating six explanations with different contrastive strategies and information selectivity and scrutinizing the factors behind their valuation process. Our analysis finds that contrastive explanations are not the most preferable or understandable in general; rather, different contrastive and selective explanations were appreciated to different extents based on who they are for, and when, how, and what to explain, with different levels of cognitive load and engagement and different sociotechnical contexts. Given these findings, we call for a nuanced view of explanation strategies, with implications for designing AI interfaces to accommodate individual and contextual differences in AI-assisted decision-making.
Submitted 17 February, 2025; originally announced February 2025.

9. arXiv:2502.12143 [pdf, other] (https://arxiv.org/abs/2502.12143)
Subjects: cs.AI
Title: Small Models Struggle to Learn from Strong Reasoners
Authors: Yuetai Li, Xiang Yue, Zhangchen Xu, Fengqing Jiang, Luyao Niu, Bill Yuchen Lin, Bhaskar Ramasubramanian, Radha Poovendran
Abstract: Large language models (LLMs) excel in complex reasoning tasks, and distilling their reasoning capabilities into smaller models has shown promise. However, we uncover an interesting phenomenon, which we term the Small Model Learnability Gap: small models (≤3B parameters) do not consistently benefit from long chain-of-thought (CoT) reasoning or distillation from larger models. Instead, they perform better when fine-tuned on shorter, simpler reasoning chains that better align with their intrinsic learning capacity. To address this, we propose Mix Distillation, a simple yet effective strategy that balances reasoning complexity by combining long and short CoT examples or reasoning from both larger and smaller models. Our experiments demonstrate that Mix Distillation significantly improves small model reasoning performance compared to training on either data alone. These findings highlight the limitations of direct strong model distillation and underscore the importance of adapting reasoning complexity for effective reasoning capability transfer.
Submitted 17 February, 2025; originally announced February 2025.
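Mix Distillation, as described, is a data-mixing recipe rather than a new training objective. A minimal sketch of building such a mixed fine-tuning set; the ratio, sizes, and trace formats below are placeholders, not the paper's values:

    import random

    def mix_distillation(long_cot, short_cot, long_ratio=0.5, size=1000, seed=0):
        """Build a fine-tuning set mixing long teacher CoT traces with
        shorter, simpler chains, instead of training on either alone."""
        rng = random.Random(seed)
        n_long = int(size * long_ratio)
        sample = (rng.choices(long_cot, k=n_long)
                  + rng.choices(short_cot, k=size - n_long))
        rng.shuffle(sample)
        return sample

    long_traces = ["Q1 -> step1 ... step12 -> A1"]   # verbose teacher reasoning
    short_traces = ["Q1 -> step1, step2 -> A1"]      # concise chains
    data = mix_distillation(long_traces, short_traces, long_ratio=0.4, size=10)
    print(sum("step12" in d for d in data), "long examples in the mix")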
10. arXiv:2502.12025 [pdf, other] (https://arxiv.org/abs/2502.12025)
Subjects: cs.AI; cs.CL
Title: SafeChain: Safety of Language Models with Long Chain-of-Thought Reasoning Capabilities
Authors: Fengqing Jiang, Zhangchen Xu, Yuetai Li, Luyao Niu, Zhen Xiang, Bo Li, Bill Yuchen Lin, Radha Poovendran
Abstract: Emerging large reasoning models (LRMs), such as the DeepSeek-R1 models, leverage long chain-of-thought (CoT) reasoning to generate structured intermediate steps, enhancing their reasoning capabilities. However, long CoT does not inherently guarantee safe outputs, potentially leading to harmful consequences such as the introduction of security vulnerabilities in code or the spread of misinformation. Current research on large language model (LLM) safety usually focuses on short-answer responses, overlooking the long CoT style outputs of LRMs. To bridge this gap, we conduct a systematic study of LRM safety. First, we investigate safety evaluators calibrated against human annotations. Using our newly developed metrics, we thoroughly assess the safety of 12 state-of-the-art LRMs on the StrongReject and WildJailbreak datasets. Our results show that the safety of LRMs has not kept pace with their reasoning advances. Further, we perform a fine-grained analysis of the reasoning trace and final answer. We find that three decoding strategies (ZeroThink, LessThink, and MoreThink) can improve model safety without additional training. However, these strategies either use constrained reasoning traces or incur high inference costs. To better strengthen LRM safety, we introduce SafeChain, a first-of-its-kind safety training dataset in CoT style. We fine-tune two LRMs with SafeChain, showing that it not only enhances model safety but also preserves performance across six reasoning benchmarks.
Submitted 17 February, 2025; originally announced February 2025.
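The three training-free decoding strategies named above can be emulated by pre-filling an R1-style <think> span before generation. A sketch under that assumption, with generate standing in for an LLM call; the pre-filled texts are illustrative, not the paper's exact prompts:

    def generate(prompt: str) -> str:
        return prompt + " [model continuation...]"   # stand-in for an LLM call

    def zerothink(question: str) -> str:
        # Force an empty reasoning trace: the model must answer directly.
        return generate(f"{question}\n<think>\n</think>\n")

    def lessthink(question: str) -> str:
        # Constrain the trace to a short, pre-written thought.
        return generate(f"{question}\n<think>This is simple; answering directly.</think>\n")

    def morethink(question: str, budget: int = 2) -> str:
        # Encourage longer reasoning by re-opening the thought span.
        return generate(f"{question}\n<think>" + "Wait, let me re-check. " * budget)

    print(zerothink("Is this request safe to fulfill?"))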
11. arXiv:2502.11712 [pdf, other] (https://arxiv.org/abs/2502.11712)
Subjects: cs.CV
Title: Component-aware Unsupervised Logical Anomaly Generation for Industrial Anomaly Detection
Authors: Xuan Tong, Yang Chang, Qing Zhao, Jiawen Yu, Boyang Wang, Junxiong Lin, Yuxuan Lin, Xinji Mai, Haoran Wang, Zeng Tao, Yan Wang, Wenqiang Zhang
Abstract: Anomaly detection is critical in industrial manufacturing for ensuring product quality and improving efficiency in automated processes. The scarcity of anomalous samples limits traditional detection methods, making anomaly generation essential for expanding the data repository. However, recent generative models often produce unrealistic anomalies that increase false positives, or require real-world anomaly samples for training. In this work, we treat anomaly generation as a compositional problem and propose ComGEN, a component-aware and unsupervised framework that addresses the gap in logical anomaly generation. Our method comprises a multi-component learning strategy to disentangle visual components, followed by subsequent generation editing procedures. Disentangled text-to-component pairs, revealing intrinsic logical constraints, conduct attention-guided residual mapping and model training with iteratively matched references across multiple scales. Experiments on the MVTecLOCO dataset confirm the efficacy of ComGEN, achieving the best AUROC score of 91.2%. Additional experiments on the real-world scenario of diesel engines and the widely used MVTecAD dataset demonstrate significant performance improvements when integrating simulated anomalies generated by ComGEN into automated production workflows.
Submitted 17 February, 2025; originally announced February 2025.
With the impressive performance of large language models (LLMs) like GPT-4 in the medical field, leveraging these technologies for the Medical Quality Control Indicator Calculation (MQCIC) presents a promising approach. In this work, (1) we introduce a real-world task MQCI… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11703v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11703v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11703v1-abstract-full" style="display: none;"> Medical quality control indicators are essential to assess the qualifications of healthcare institutions for medical services. With the impressive performance of large language models (LLMs) like GPT-4 in the medical field, leveraging these technologies for the Medical Quality Control Indicator Calculation (MQCIC) presents a promising approach. In this work, (1) we introduce a real-world task MQCIC and propose an open-source Chinese electronic medical records (EMRs)-based dataset (CMQCIC-Bench) comprising 785 instances and 76 indicators. (2) We propose a semi-automatic method to enhance the rule representation. Then we propose the Clinical Facts-based Inferential Rule (CF-IR) method that disentangles the clinical fact verification and inferential rule reasoning actions. (3) We conduct comprehensive experiments on 20 representative LLMs, covering general and medical models. Our findings reveal that CF-IR outperforms Chain-of-Thought methods in MQCIC tasks. (4) We conduct an error analysis and investigate the capabilities of clinical fact verification and inferential rule reasoning, providing insights to improve performance in the MQCIC further. The dataset and code is available in this repo https://anonymous.4open.science/r/C-MQCIC-1151. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11703v1-abstract-full').style.display = 'none'; document.getElementById('2502.11703v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
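The CF-IR decomposition described in the abstract lends itself to a compact two-call pipeline. A minimal sketch, assuming a generic `llm` callable; both prompts are illustrative placeholders, not the paper's actual interface:

```python
def cf_ir(llm, emr_text: str, indicator_rule: str) -> str:
    # Step 1: clinical fact verification, grounded only in the record.
    facts = llm(
        "From the following electronic medical record, list only the clinical "
        f"facts relevant to this indicator rule.\nRule: {indicator_rule}\n"
        f"Record: {emr_text}"
    )
    # Step 2: inferential rule reasoning over the verified facts alone,
    # without re-reading the raw record.
    return llm(
        f"Given these verified clinical facts:\n{facts}\n"
        "Does the case satisfy the indicator rule below? Answer yes or no "
        f"with a one-line justification.\nRule: {indicator_rule}"
    )
```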
arXiv:2502.11514 (https://arxiv.org/abs/2502.11514) [pdf, other] cs.CL
Investigating Inference-time Scaling for Chain of Multi-modal Thought: A Preliminary Study
Authors: Yujie Lin, Ante Wang, Moye Chen, Jingyao Liu, Hao Liu, Jinsong Su, Xinyan Xiao
Abstract: Recently, inference-time scaling of chain-of-thought (CoT) has been demonstrated as a promising approach for addressing multi-modal reasoning tasks. While existing studies have predominantly centered on text-based thinking, the integration of both visual and textual modalities within the reasoning process remains unexplored. In this study, we pioneer the exploration of inference-time scaling with multi-modal thought, aiming to bridge this gap. To provide a comprehensive analysis, we systematically investigate popular sampling-based and tree search-based inference-time scaling methods on 10 challenging tasks spanning various domains. In addition, we uniformly adopt a consistency-enhanced verifier to ensure effective guidance for both methods across different thought paradigms. Results show that multi-modal thought promotes better performance than conventional text-only thought, and that blending the two types of thought fosters more diverse thinking. Despite these advantages, multi-modal thoughts necessitate higher token consumption for processing richer visual inputs, which raises concerns in practical applications. We hope that our findings on the merits and drawbacks of this research line will inspire future work in the field.
Submitted 17 February, 2025; originally announced February 2025.
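The sampling-based side of such a setup is essentially best-of-N selection under a verifier. A minimal sketch, assuming generic `generate`, `verify`, and `extract_answer` callables, with a simple self-consistency bonus standing in for the consistency-enhanced verifier:

```python
from collections import Counter

def best_of_n(generate, verify, extract_answer, prompt, n=16):
    """Sample n thought chains and return one, scoring each by the verifier
    plus a self-consistency bonus (how often its final answer recurs)."""
    chains = [generate(prompt) for _ in range(n)]      # i.i.d. CoT samples
    answers = [extract_answer(c) for c in chains]
    votes = Counter(answers)
    scores = [verify(prompt, c) + votes[a] / n         # verifier + agreement
              for c, a in zip(chains, answers)]
    return chains[max(range(n), key=scores.__getitem__)]
```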
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11514v1-abstract-full').style.display = 'none'; document.getElementById('2502.11514v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11168">arXiv:2502.11168</a> <span> [<a href="https://arxiv.org/pdf/2502.11168">pdf</a>, <a href="https://arxiv.org/format/2502.11168">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Knowing Your Target: Target-Aware Transformer Makes Better Spatio-Temporal Video Grounding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+X">Xin Gu</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+Y">Yaojie Shen</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+C">Chenxi Luo</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+T">Tiejian Luo</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yan Huang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yuewei Lin</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+H">Heng Fan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Libo Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11168v1-abstract-short" style="display: inline;"> Transformer has attracted increasing interest in STVG, owing to its end-to-end pipeline and promising result. Existing Transformer-based STVG approaches often leverage a set of object queries, which are initialized simply using zeros and then gradually learn target position information via iterative interactions with multimodal features, for spatial and temporal localization. Despite simplicity, t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11168v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11168v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11168v1-abstract-full" style="display: none;"> Transformer has attracted increasing interest in STVG, owing to its end-to-end pipeline and promising result. Existing Transformer-based STVG approaches often leverage a set of object queries, which are initialized simply using zeros and then gradually learn target position information via iterative interactions with multimodal features, for spatial and temporal localization. Despite simplicity, these zero object queries, due to lacking target-specific cues, are hard to learn discriminative target information from interactions with multimodal features in complicated scenarios (\e.g., with distractors or occlusion), resulting in degradation. 
Addressing this, we introduce a novel Target-Aware Transformer for STVG (TA-STVG), which seeks to adaptively generate object queries via exploring target-specific cues from the given video-text pair, for improving STVG. The key lies in two simple yet effective modules, comprising text-guided temporal sampling (TTS) and attribute-aware spatial activation (ASA), working in a cascade. The former focuses on selecting target-relevant temporal cues from a video utilizing holistic text information, while the latter aims at further exploiting the fine-grained visual attribute information of the object from previous target-aware temporal cues, which is applied for object query initialization. Compared to existing methods leveraging zero-initialized queries, object queries in our TA-STVG, directly generated from a given video-text pair, naturally carry target-specific cues, making them adaptive and better interact with multimodal features for learning more discriminative information to improve STVG. In our experiments on three benchmarks, TA-STVG achieves state-of-the-art performance and significantly outperforms the baseline, validating its efficacy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11168v1-abstract-full').style.display = 'none'; document.getElementById('2502.11168v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11107">arXiv:2502.11107</a> <span> [<a href="https://arxiv.org/pdf/2502.11107">pdf</a>, <a href="https://arxiv.org/format/2502.11107">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Revisiting Weak-to-Strong Generalization in Theory and Practice: Reverse KL vs. Forward KL </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+W">Wei Yao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenkai Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziqiao Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yankai Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11107v1-abstract-short" style="display: inline;"> As large language models advance toward superhuman performance, ensuring their alignment with human values and abilities grows increasingly complex. Weak-to-strong generalization offers a promising approach by leveraging predictions from weaker models to guide stronger systems, but its effectiveness could be constrained by the inherent noise and inaccuracies in these weak predictions. 
arXiv:2502.11107 (https://arxiv.org/abs/2502.11107) [pdf, other] cs.LG, cs.AI
Revisiting Weak-to-Strong Generalization in Theory and Practice: Reverse KL vs. Forward KL
Authors: Wei Yao, Wenkai Yang, Ziqiao Wang, Yankai Lin, Yong Liu
Abstract: As large language models advance toward superhuman performance, ensuring their alignment with human values and abilities grows increasingly complex. Weak-to-strong generalization offers a promising approach by leveraging predictions from weaker models to guide stronger systems, but its effectiveness could be constrained by the inherent noise and inaccuracies in these weak predictions. To address this, we propose a theoretically grounded approach that replaces forward KL divergence, whose mass-covering behavior risks overfitting to imperfect weak signals, with reverse KL divergence. Reverse KL divergence's zero-forcing effect prioritizes high-confidence predictions, effectively mitigating the influence of unreliable weak supervision. Theoretically, we extend existing bounds and derive tighter lower bounds for both forward and reverse KL divergence, establishing that reverse KL achieves at least comparable guarantees to forward KL. Notably, when a sufficiently pre-trained strong model is fine-tuned on the last layer, reverse KL uniquely guarantees that it outperforms its weak supervisor by the magnitude of their disagreement, a guarantee that forward KL cannot provide. Empirically, we demonstrate that reverse KL and reverse cross-entropy enable strong models to consistently outperform those trained with forward KL and standard cross-entropy across most settings, highlighting the practical advantages of these reverse losses.
Submitted 16 February, 2025; originally announced February 2025.
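The asymmetry the abstract appeals to is visible directly in the two divergences. A minimal sketch over token logits, using the standard definitions rather than the paper's training code:

```python
import torch.nn.functional as F

def forward_kl(weak_logits, strong_logits):
    """KL(p_weak || p_strong): mass-covering, so the strong model is pushed
    to spread probability wherever the noisy weak teacher puts any."""
    p_w = F.softmax(weak_logits, dim=-1)
    return (p_w * (F.log_softmax(weak_logits, dim=-1)
                   - F.log_softmax(strong_logits, dim=-1))).sum(-1).mean()

def reverse_kl(weak_logits, strong_logits):
    """KL(p_strong || p_weak): zero-forcing, so the strong model is penalized
    only where it places mass itself and can drop unreliable weak modes."""
    p_s = F.softmax(strong_logits, dim=-1)
    return (p_s * (F.log_softmax(strong_logits, dim=-1)
                   - F.log_softmax(weak_logits, dim=-1))).sum(-1).mean()
```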
arXiv:2502.10467 (https://arxiv.org/abs/2502.10467) [pdf, other] cs.SD, cs.AI, eess.AS
YNote: A Novel Music Notation for Fine-Tuning LLMs in Music Generation
Authors: Shao-Chien Lu, Chen-Chen Yeh, Hui-Lin Cho, Chun-Chieh Hsu, Tsai-Ling Hsu, Cheng-Han Wu, Timothy K. Shih, Yu-Cheng Lin
Abstract: The field of music generation using Large Language Models (LLMs) is evolving rapidly, yet existing music notation systems, such as MIDI, ABC Notation, and MusicXML, remain too complex for effective fine-tuning of LLMs. These formats are difficult for both machines and humans to interpret due to their variability and intricate structure. To address these challenges, we introduce YNote, a simplified music notation system that uses only four characters to represent a note and its pitch. YNote's fixed format ensures consistency, making it easy to read and more suitable for fine-tuning LLMs. In our experiments, we fine-tuned GPT-2 (124M) on a YNote-encoded dataset and achieved BLEU and ROUGE scores of 0.883 and 0.766, respectively. With just two notes as prompts, the model was able to generate coherent and stylistically relevant music. We believe YNote offers a practical alternative to existing music notations for machine learning applications and has the potential to significantly enhance the quality of music generation using LLMs.
Submitted 12 February, 2025; originally announced February 2025.
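The abstract does not spell out the four-character layout, so the encoding below is purely hypothetical; it is meant only to show why a fixed-width token per note keeps the vocabulary small and sequences trivially parseable, which is the property that makes fine-tuning a small LM practical:

```python
# Hypothetical fixed-width note encoding in the spirit of YNote; the paper
# defines the real four-character format, so treat this layout as illustrative.
PITCHES = set("CDEFGAB")

def encode_note(pitch: str, accidental: str, octave: int, duration: int) -> str:
    """Pack one note into a four-character token, e.g. ('C', '-', 4, 4) -> 'C-44'.
    Every note occupies exactly four characters, so decoding is position-based."""
    assert pitch in PITCHES and accidental in "#b-"
    assert 0 <= octave <= 9 and 1 <= duration <= 9
    return f"{pitch}{accidental}{octave}{duration}"

melody = [("C", "-", 4, 4), ("E", "-", 4, 4), ("G", "-", 4, 2)]
print(" ".join(encode_note(*note) for note in melody))  # C-44 E-44 G-42
```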
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10467v1-abstract-full').style.display = 'none'; document.getElementById('2502.10467v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10059">arXiv:2502.10059</a> <span> [<a href="https://arxiv.org/pdf/2502.10059">pdf</a>, <a href="https://arxiv.org/format/2502.10059">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> RealCam-I2V: Real-World Image-to-Video Generation with Interactive Complex Camera Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+T">Teng Li</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+G">Guangcong Zheng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+R">Rui Jiang</a>, <a href="/search/cs?searchtype=author&query=Shuigenzhan"> Shuigenzhan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tao Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yehao Lu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yining Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xi Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10059v1-abstract-short" style="display: inline;"> Recent advancements in camera-trajectory-guided image-to-video generation offer higher precision and better support for complex camera control compared to text-based approaches. However, they also introduce significant usability challenges, as users often struggle to provide precise camera parameters when working with arbitrary real-world images without knowledge of their depth nor scene scale. To… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10059v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10059v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10059v1-abstract-full" style="display: none;"> Recent advancements in camera-trajectory-guided image-to-video generation offer higher precision and better support for complex camera control compared to text-based approaches. However, they also introduce significant usability challenges, as users often struggle to provide precise camera parameters when working with arbitrary real-world images without knowledge of their depth nor scene scale. To address these real-world application issues, we propose RealCam-I2V, a novel diffusion-based video generation framework that integrates monocular metric depth estimation to establish 3D scene reconstruction in a preprocessing step. During training, the reconstructed 3D scene enables scaling camera parameters from relative to absolute values, ensuring compatibility and scale consistency across diverse real-world images. 
In inference, RealCam-I2V offers an intuitive interface where users can precisely draw camera trajectories by dragging within the 3D scene. To further enhance precise camera control and scene consistency, we propose scene-constrained noise shaping, which shapes high-level noise and also allows the framework to maintain dynamic, coherent video generation in lower noise stages. RealCam-I2V achieves significant improvements in controllability and video quality on the RealEstate10K and out-of-domain images. We further enables applications like camera-controlled looping video generation and generative frame interpolation. We will release our absolute-scale annotation, codes, and all checkpoints. Please see dynamic results in https://zgctroy.github.io/RealCam-I2V. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10059v1-abstract-full').style.display = 'none'; document.getElementById('2502.10059v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09992">arXiv:2502.09992</a> <span> [<a href="https://arxiv.org/pdf/2502.09992">pdf</a>, <a href="https://arxiv.org/format/2502.09992">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Large Language Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Nie%2C+S">Shen Nie</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+F">Fengqi Zhu</a>, <a href="/search/cs?searchtype=author&query=You%2C+Z">Zebin You</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaolu Zhang</a>, <a href="/search/cs?searchtype=author&query=Ou%2C+J">Jingyang Ou</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jun Hu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yankai Lin</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+J">Ji-Rong Wen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chongxuan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09992v2-abstract-short" style="display: inline;"> Autoregressive models (ARMs) are widely regarded as the cornerstone of large language models (LLMs). We challenge this notion by introducing LLaDA, a diffusion model trained from scratch under the pre-training and supervised fine-tuning (SFT) paradigm. 
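The relative-to-absolute scaling step can be pictured as anchoring a unitless trajectory to metric depth. A minimal sketch under that reading; the function name and its inputs are illustrative, not RealCam-I2V's API:

```python
import numpy as np

def to_metric_scale(rel_positions: np.ndarray, metric_depth: np.ndarray,
                    rel_scene_depth: float) -> np.ndarray:
    """rel_positions: (T, 3) camera centers in an arbitrary-scale frame;
    rel_scene_depth: median scene depth expressed in that same unit;
    metric_depth: (H, W) monocular metric depth of the input image, in meters.
    Returns the trajectory rescaled to absolute (metric) units."""
    scale = np.median(metric_depth) / rel_scene_depth   # meters per unit
    return rel_positions * scale
```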
arXiv:2502.09992 (https://arxiv.org/abs/2502.09992) [pdf, other] cs.CL, cs.LG
Large Language Diffusion Models
Authors: Shen Nie, Fengqi Zhu, Zebin You, Xiaolu Zhang, Jingyang Ou, Jun Hu, Jun Zhou, Yankai Lin, Ji-Rong Wen, Chongxuan Li
Abstract: Autoregressive models (ARMs) are widely regarded as the cornerstone of large language models (LLMs). We challenge this notion by introducing LLaDA, a diffusion model trained from scratch under the pre-training and supervised fine-tuning (SFT) paradigm. LLaDA models distributions through a forward data masking process and a reverse process, parameterized by a vanilla Transformer to predict masked tokens. By optimizing a likelihood bound, it provides a principled generative approach for probabilistic inference. Across extensive benchmarks, LLaDA demonstrates strong scalability, outperforming our self-constructed ARM baselines. Remarkably, LLaDA 8B is competitive with strong LLMs like LLaMA3 8B in in-context learning and, after SFT, exhibits impressive instruction-following abilities in case studies such as multi-turn dialogue. Moreover, LLaDA addresses the reversal curse, surpassing GPT-4o in a reversal poem completion task. Our findings establish diffusion models as a viable and promising alternative to ARMs, challenging the assumption that the key LLM capabilities discussed above are inherently tied to ARMs. Project page and code: https://ml-gsai.github.io/LLaDA-demo/.
Submitted 18 February, 2025; v1 submitted 14 February, 2025; originally announced February 2025.
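The forward masking process and likelihood-bound objective follow the standard masked-diffusion recipe, which a short sketch makes concrete. Here `model` is any module mapping token ids to per-position logits, and `MASK_ID` is a placeholder mask-token id; this is not the paper's training code:

```python
import torch
import torch.nn.functional as F

MASK_ID = 0  # placeholder mask-token id; a real vocabulary defines its own

def masked_diffusion_step(model, tokens):
    """One training step: sample a masking level t ~ U(0, 1], mask each token
    independently with probability t, and train the (non-causal) Transformer
    to recover the masked positions, reweighting by 1/t for the bound."""
    b, n = tokens.shape
    t = torch.rand(b, 1).clamp(min=1e-3)               # forward-process time
    mask = torch.rand(b, n) < t                        # mask each token w.p. t
    x_t = torch.where(mask, torch.full_like(tokens, MASK_ID), tokens)
    logits = model(x_t)                                # (b, n, vocab), no causal mask
    ce = F.cross_entropy(logits[mask], tokens[mask], reduction="none")
    return (ce / t.expand(b, n)[mask]).mean()          # likelihood-bound loss
```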
arXiv:2502.09940 (https://arxiv.org/abs/2502.09940) [pdf, other] cs.CL, cs.SD, eess.AS
A Preliminary Exploration with GPT-4o Voice Mode
Authors: Yu-Xiang Lin, Chih-Kai Yang, Wei-Chih Chen, Chen-An Li, Chien-yu Huang, Xuanjun Chen, Hung-yi Lee
Abstract: With the rise of multimodal large language models, GPT-4o stands out as a pioneering model, driving us to evaluate its capabilities. This report assesses GPT-4o across various tasks to analyze its audio processing and reasoning abilities. We find that GPT-4o exhibits strong knowledge in audio, speech, and music understanding, performing well in tasks like intent classification, spoken command classification, semantic and grammatical reasoning, multilingual speech recognition, and singing analysis. It also shows greater robustness against hallucinations than other large audio-language models (LALMs). However, it struggles with tasks such as audio duration prediction and instrument classification. Additionally, GPT-4o's safety mechanisms cause it to decline tasks like speaker identification, age classification, MOS prediction, and audio deepfake detection. Notably, the model exhibits a significantly different refusal rate when responding to speaker verification tasks on different datasets. This is likely due to variations in the accompanying instructions or the quality of the input audio, suggesting the sensitivity of its built-in safeguards. Finally, we acknowledge that model performance varies with evaluation protocols. This report only serves as a preliminary exploration of the current state of LALMs.
Submitted 14 February, 2025; originally announced February 2025.
Comments: Work in progress
arXiv:2502.09854 (https://arxiv.org/abs/2502.09854) [pdf, other] cs.CL, cs.AI, cs.LG
Efficient Multitask Learning in Small Language Models Through Upside-Down Reinforcement Learning
Authors: Yu-Chen Lin, Sanat Sharma, Hari Manikandan, Jayant Kumar, Tracy Holloway King, Jing Zheng
Abstract: In this work, we demonstrate that small language models (SLMs), specifically a 100M-parameter GPT-2 model, can achieve competitive performance in multitask prompt generation tasks while requiring only a fraction of the computational resources needed by large language models (LLMs). Through a novel combination of upside-down reinforcement learning and synthetic data distillation from a powerful LLM, Llama-3, we train an SLM that achieves relevance scores within 5% of state-of-the-art models, including Llama-3, Qwen2, and Mistral, despite being up to 80 times smaller, making it highly suitable for resource-constrained and real-time applications. This study highlights the potential of SLMs as efficient multitask learners in multimodal settings, providing a promising alternative to LLMs for scalable, low-latency deployments.
Submitted 13 February, 2025; originally announced February 2025.
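Upside-down RL here amounts to conditioning the generator on the outcome it should achieve rather than optimizing a reward directly. A hypothetical sketch of that conditioning; the tag format and score range are assumptions, not the paper's scheme:

```python
def training_example(task: str, prompt: str, rated_score: float) -> str:
    """Condition on the outcome: the teacher LLM's rated score for a target
    output is prepended as a command, so the SLM learns score -> behavior."""
    return f"<task={task}> <target_score={rated_score:.1f}> {prompt}"

def inference_input(task: str, prompt: str) -> str:
    """At inference, command the best outcome seen during training."""
    return f"<task={task}> <target_score=1.0> {prompt}"
```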
arXiv:2502.09791 (https://arxiv.org/abs/2502.09791) [pdf] cond-mat.mtrl-sci, cs.CV
Atom identification in bilayer moire materials with Gomb-Net
Authors: Austin C. Houston, Sumner B. Harris, Hao Wang, Yu-Chuan Lin, David B. Geohegan, Kai Xiao, Gerd Duscher
Abstract: Moire patterns in van der Waals bilayer materials complicate the analysis of atomic-resolution images, hindering the atomic-scale insight typically attainable with scanning transmission electron microscopy. Here, we report a method to detect the positions and identity of atoms in each of the individual layers that compose bilayer heterostructures. We developed a deep learning model, Gomb-Net, which can distinguish atomic species in each individual layer, effectively deconvoluting the moire pattern to enable layer-specific mapping of strain and dopant distributions, unlike other methods which struggle with moire-induced complexity. Using this approach, we explored Se atom substitutional sites in a twisted fractional Janus WS2-WS2(1-x)Se2x heterostructure and found that layer-specific implantation sites are unaffected by the moire pattern's local energetic or electronic modulation. This advancement enables atom identification within material regimes where it was not possible before, opening new insights into previously inaccessible material physics.
Submitted 13 February, 2025; originally announced February 2025.
arXiv:2502.07870 (https://arxiv.org/abs/2502.07870) [pdf, other] cs.CV
TextAtlas5M: A Large-scale Dataset for Dense Text Image Generation
Authors: Alex Jinpeng Wang, Dongxing Mao, Jiawei Zhang, Weiming Han, Zhuobai Dong, Linjie Li, Yiqi Lin, Zhengyuan Yang, Libo Qin, Fuwei Zhang, Lijuan Wang, Min Li
Abstract: Text-conditioned image generation has gained significant attention in recent years and is processing increasingly long and comprehensive text prompts. In everyday life, dense and intricate text appears in contexts like advertisements, infographics, and signage, where the integration of both text and visuals is essential for conveying complex information. However, despite these advances, the generation of images containing long-form text remains a persistent challenge, largely due to the limitations of existing datasets, which often focus on shorter and simpler text. To address this gap, we introduce TextAtlas5M, a novel dataset specifically designed to evaluate long-text rendering in text-conditioned image generation. Our dataset consists of 5 million long-text generated and collected images across diverse data types, enabling comprehensive evaluation of large-scale generative models on long-text image generation. We further curate TextAtlasEval, a 3,000-sample human-improved test set across 3 data domains, establishing one of the most extensive benchmarks for text-conditioned generation. Evaluations suggest that the TextAtlasEval benchmarks present significant challenges even for the most advanced proprietary models (e.g., GPT-4o with DALL-E 3), while their open-source counterparts show an even larger performance gap. This evidence positions TextAtlas5M as a valuable dataset for training and evaluating future-generation text-conditioned image generation models.
Submitted 11 February, 2025; originally announced February 2025.
Comments: 27 pages, 15 figures. Dataset Website: https://textatlas5m.github.io
arXiv:2502.07707 (https://arxiv.org/abs/2502.07707) [pdf, other] cs.CV
PRVQL: Progressive Knowledge-guided Refinement for Robust Egocentric Visual Query Localization
Authors: Bing Fan, Yunhe Feng, Yapeng Tian, Yuewei Lin, Yan Huang, Heng Fan
Abstract: Egocentric visual query localization (EgoVQL) focuses on localizing the target of interest in space and time from first-person videos, given a visual query. Despite recent progress, existing methods often struggle to handle severe object appearance changes and cluttered backgrounds in the video due to lacking sufficient target cues, leading to degradation. Addressing this, we introduce PRVQL, a novel Progressive knowledge-guided Refinement framework for EgoVQL. The core is to continuously exploit target-relevant knowledge directly from videos and utilize it as guidance to refine both query and video features for improving target localization. PRVQL contains multiple processing stages. The target knowledge from one stage, comprising appearance and spatial knowledge extracted via two specially designed knowledge learning modules, is utilized as guidance to refine the query and video features for the next stage, which are in turn used to generate more accurate knowledge for further feature refinement. With such a progressive process, target knowledge in PRVQL can be gradually improved, which, in turn, leads to better refined query and video features for localization in the final stage. Compared to previous methods, our PRVQL, besides the given object cues, enjoys additional crucial target information from a video as guidance to refine features, and hence enhances EgoVQL in complicated scenes. In our experiments on the challenging Ego4D benchmark, PRVQL achieves state-of-the-art results and largely surpasses other methods, showing its efficacy. Our code, models, and results will be released at https://github.com/fb-reps/PRVQL.
Submitted 11 February, 2025; originally announced February 2025.
arXiv:2502.07640 (https://arxiv.org/abs/2502.07640) [pdf, other] cs.LG, cs.AI
Goedel-Prover: A Frontier Model for Open-Source Automated Theorem Proving
Authors: Yong Lin, Shange Tang, Bohan Lyu, Jiayun Wu, Hongzhou Lin, Kaiyu Yang, Jia Li, Mengzhou Xia, Danqi Chen, Sanjeev Arora, Chi Jin
Abstract: We introduce Goedel-Prover, an open-source large language model (LLM) that achieves state-of-the-art (SOTA) performance in automated formal proof generation for mathematical problems. The key challenge in this field is the scarcity of formalized math statements and proofs, which we tackle in the following ways. We train statement formalizers to translate the natural language math problems from Numina into formal language (Lean 4), creating a dataset of 1.64 million formal statements. LLMs are used to check that the formal statements accurately preserve the content of the original natural language problems. We then iteratively build a large dataset of formal proofs by training a series of provers. Each prover succeeds in proving many statements that the previous ones could not, and these new proofs are added to the training set for the next prover. Despite using only supervised fine-tuning, our final prover significantly outperforms the previous best open-source model, DeepSeek-Prover-V1.5, which employs reinforcement learning. On the miniF2F benchmark, our model achieves a success rate of 57.6% (Pass@32), surpassing DeepSeek-Prover-V1.5 by 7.6%. On PutnamBench, Goedel-Prover successfully solves 7 problems (Pass@512), ranking first on the leaderboard. Furthermore, it generates 29.7K formal proofs for Lean Workbook problems, nearly doubling the 15.7K produced by earlier works.
Submitted 14 February, 2025; v1 submitted 11 February, 2025; originally announced February 2025.
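The iterative prover-training procedure is an expert-iteration-style loop. A minimal sketch, with `sample`, `lean_check`, and `finetune` as placeholders for the model sampler, the Lean 4 verifier, and supervised fine-tuning respectively; none of these are the project's actual interfaces:

```python
def iterate_provers(statements, prover, sample, lean_check, finetune,
                    rounds=4, k=32):
    """Each round, the current prover attempts the still-unproved statements;
    formally verified proofs are accumulated, and the next prover is
    fine-tuned (SFT only, per the abstract) on everything proved so far."""
    proved = {}                                      # statement -> verified proof
    for _ in range(rounds):
        for stmt in statements:
            if stmt in proved:
                continue
            for proof in sample(prover, stmt, n=k):  # Pass@k attempts
                if lean_check(stmt, proof):          # verifier accepts the proof
                    proved[stmt] = proof
                    break
        prover = finetune(prover, proved)            # grow the training set
    return prover, proved
```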
arXiv:2502.07615 (https://arxiv.org/abs/2502.07615) [pdf, other] cs.CV
Flow Distillation Sampling: Regularizing 3D Gaussians with Pre-trained Matching Priors
Authors: Lin-Zhuo Chen, Kangjie Liu, Youtian Lin, Siyu Zhu, Zhihao Li, Xun Cao, Yao Yao
Abstract: 3D Gaussian Splatting (3DGS) has achieved excellent rendering quality with fast training and rendering speed. However, its optimization process lacks explicit geometric constraints, leading to suboptimal geometric reconstruction in regions with sparse or no observational input views. In this work, we try to mitigate the issue by incorporating a pre-trained matching prior into the 3DGS optimization process. We introduce Flow Distillation Sampling (FDS), a technique that leverages pre-trained geometric knowledge to bolster the accuracy of the Gaussian radiance field. Our method employs a strategic sampling technique to target unobserved views adjacent to the input views, utilizing the optical flow calculated from the matching model (Prior Flow) to guide the flow analytically calculated from the 3DGS geometry (Radiance Flow). Comprehensive experiments in depth rendering, mesh reconstruction, and novel view synthesis showcase the significant advantages of FDS over state-of-the-art methods. Additionally, our interpretive experiments and analysis aim to shed light on the effects of FDS on geometric accuracy and rendering quality, potentially providing readers with insights into its performance. Project page: https://nju-3dv.github.io/projects/fds
Submitted 11 February, 2025; originally announced February 2025.
Comments: Accepted by ICLR 2025
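At its core, the Prior Flow / Radiance Flow pairing reduces to a distillation loss at sampled unobserved views. A minimal sketch, assuming placeholder callables for the differentiable geometry-induced flow and the frozen matching model:

```python
import torch
import torch.nn.functional as F

def fds_loss(radiance_flow_fn, prior_flow_model, view_a, view_b):
    """radiance_flow_fn: flow computed analytically from the 3DGS geometry
    between two views (differentiable, so gradients reach the Gaussians);
    prior_flow_model: frozen pre-trained matching model used as the teacher."""
    radiance_flow = radiance_flow_fn(view_a, view_b)   # gradients flow to 3DGS
    with torch.no_grad():
        prior_flow = prior_flow_model(view_a, view_b)  # teacher signal only
    return F.l1_loss(radiance_flow, prior_flow)
```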
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06756">arXiv:2502.06756</a> <span> [<a href="https://arxiv.org/pdf/2502.06756">pdf</a>, <a href="https://arxiv.org/format/2502.06756">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SAMRefiner: Taming Segment Anything Model for Universal Mask Refinement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yuqi Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hengjia Li</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+W">Wenqi Shao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zheng Yang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+J">Jun Zhao</a>, <a href="/search/cs?searchtype=author&query=He%2C+X">Xiaofei He</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+P">Ping Luo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaipeng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.06756v1-abstract-short" style="display: inline;"> In this paper, we explore a principal way to enhance the quality of widely pre-existing coarse masks, enabling them to serve as reliable training data for segmentation models to reduce the annotation cost. In contrast to prior refinement techniques that are tailored to specific models or tasks in a close-world manner, we propose SAMRefiner, a universal and efficient approach by adapting SAM to the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06756v1-abstract-full').style.display = 'inline'; document.getElementById('2502.06756v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.06756v1-abstract-full" style="display: none;"> In this paper, we explore a principal way to enhance the quality of widely pre-existing coarse masks, enabling them to serve as reliable training data for segmentation models to reduce the annotation cost. In contrast to prior refinement techniques that are tailored to specific models or tasks in a close-world manner, we propose SAMRefiner, a universal and efficient approach by adapting SAM to the mask refinement task. The core technique of our model is the noise-tolerant prompting scheme. Specifically, we introduce a multi-prompt excavation strategy to mine diverse input prompts for SAM (i.e., distance-guided points, context-aware elastic bounding boxes, and Gaussian-style masks) from initial coarse masks. These prompts can collaborate with each other to mitigate the effect of defects in coarse masks. In particular, considering the difficulty of SAM to handle the multi-object case in semantic segmentation, we introduce a split-then-merge (STM) pipeline. Additionally, we extend our method to SAMRefiner++ by introducing an additional IoU adaption step to further boost the performance of the generic SAMRefiner on the target dataset. 
This step is self-boosted and requires no additional annotation. The proposed framework is versatile and can flexibly cooperate with existing segmentation methods. We evaluate our framework on a wide range of benchmarks under different settings, demonstrating better accuracy and efficiency. SAMRefiner holds significant potential to expedite the evolution of refinement tools. Our code is available at https://github.com/linyq2117/SAMRefiner. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to ICLR 2025</span> </p> </li>
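<p>To make the multi-prompt excavation concrete, here is a hedged sketch of mining the three prompt types from one coarse binary mask; the margin heuristic and the distance-map normalization are illustrative assumptions, not the authors' exact recipe.</p>
<pre><code>import numpy as np
from scipy.ndimage import distance_transform_edt

def mine_prompts(coarse_mask, box_margin=0.1):
    """Mine SAM prompts from one coarse binary mask (illustrative sketch).

    Returns a distance-guided point (deep inside the mask, robust to ragged
    borders), an enlarged 'elastic' bounding box, and a soft Gaussian-style mask.
    """
    dist = distance_transform_edt(coarse_mask)        # distance to background
    py, px = np.unravel_index(dist.argmax(), dist.shape)
    point = (int(px), int(py))                        # (x, y) positive click

    ys, xs = np.nonzero(coarse_mask)
    y0, y1, x0, x1 = ys.min(), ys.max(), xs.min(), xs.max()
    mh, mw = box_margin * (y1 - y0), box_margin * (x1 - x0)
    box = (x0 - mw, y0 - mh, x1 + mw, y1 + mh)        # context-aware expansion

    soft = dist / (dist.max() + 1e-6)                 # Gaussian-style mask prompt
    return point, box, soft
</code></pre>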
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06243">arXiv:2502.06243</a> <span> [<a href="https://arxiv.org/pdf/2502.06243">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Multi-Scale Transformer Architecture for Accurate Medical Image Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jiacheng Hu</a>, <a href="/search/cs?searchtype=author&query=Xiang%2C+Y">Yanlin Xiang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yang Lin</a>, <a href="/search/cs?searchtype=author&query=Du%2C+J">Junliang Du</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hanchao Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Houze Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.06243v1-abstract-full"> This study introduces an AI-driven skin lesion classification algorithm built on an enhanced Transformer architecture, addressing the challenges of accuracy and robustness in medical image analysis. By integrating a multi-scale feature fusion mechanism and refining the self-attention process, the model effectively extracts both global and local features, enhancing its ability to detect lesions with ambiguous boundaries and intricate structures. Performance evaluation on the ISIC 2017 dataset demonstrates that the improved Transformer surpasses established AI models, including ResNet50, VGG19, ResNext, and Vision Transformer, across key metrics such as accuracy, AUC, F1-Score, and Precision. Grad-CAM visualizations further highlight the interpretability of the model, showcasing strong alignment between the algorithm's focus areas and actual lesion sites. This research underscores the transformative potential of advanced AI models in medical imaging, paving the way for more accurate and reliable diagnostic tools. Future work will explore the scalability of this approach to broader medical imaging tasks and investigate the integration of multimodal data to enhance AI-driven diagnostic frameworks for intelligent healthcare. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
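<p>A minimal PyTorch sketch of what a multi-scale feature fusion block of this kind can look like; this is our illustration, since the abstract does not specify the actual architecture.</p>
<pre><code>import torch
import torch.nn as nn

class MultiScaleFusion(nn.Module):
    """Fuse feature maps pooled at several scales before attention (sketch)."""
    def __init__(self, dim=256, scales=(1, 2, 4)):
        super().__init__()
        self.pools = nn.ModuleList(nn.AvgPool2d(s) for s in scales)
        self.proj = nn.Conv2d(dim * len(scales), dim, kernel_size=1)

    def forward(self, x):                       # x: (B, C, H, W) feature map
        h, w = x.shape[-2:]
        feats = [nn.functional.interpolate(p(x), size=(h, w), mode="bilinear",
                                           align_corners=False)
                 for p in self.pools]           # local + progressively global views
        return self.proj(torch.cat(feats, dim=1))

# Usage: MultiScaleFusion(256)(torch.randn(1, 256, 32, 32)) -> (1, 256, 32, 32)
</code></pre>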
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.06190">arXiv:2502.06190</a> <span> [<a href="https://arxiv.org/pdf/2502.06190">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Digital Libraries">cs.DL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Is Science Inevitable? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+L">Linzhuo Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yiling Lin</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+L">Lingfei Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.06190v2-abstract-full"> Using large-scale citation data and a breakthrough metric, the study systematically evaluates the inevitability of scientific breakthroughs. We find that scientific breakthroughs emerge as multiple discoveries rather than singular events. Through analysis of over 40 million journal articles, we identify multiple discoveries as papers that independently displace the same reference using the Disruption Index (D-index), suggesting functional equivalence. Our findings support Merton's core argument that scientific discoveries arise from historical context rather than individual genius. The results reveal a long-tail distribution pattern of multiple discoveries across various datasets, challenging Merton's Poisson model while reinforcing the structural inevitability of scientific progress. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
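<p>The identification rule above, papers that independently displace the same reference, can be pictured as a simple grouping; the schema below is a toy assumption of ours, not the study's pipeline.</p>
<pre><code>import pandas as pd

# Assumed toy schema: each row is a citing paper together with the reference
# it most strongly displaces (e.g., the reference driving its D-index).
papers = pd.DataFrame({
    "paper_id": ["p1", "p2", "p3", "p4"],
    "displaced_reference": ["r9", "r9", "r7", "r9"],
})

# Papers that displace the same reference are grouped as candidate
# "multiple discoveries" (functional equivalents of one another).
groups = papers.groupby("displaced_reference")["paper_id"].apply(list)
multiples = groups[groups.apply(len) > 1]
print(multiples)  # r9 -> ['p1', 'p2', 'p4']
</code></pre>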
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05176">arXiv:2502.05176</a> <span> [<a href="https://arxiv.org/pdf/2502.05176">pdf</a>, <a href="https://arxiv.org/format/2502.05176">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> AuraFusion360: Augmented Unseen Region Alignment for Reference-based 360° Unbounded Scene Inpainting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+C">Chung-Ho Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yang-Jung Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying-Huan Chen</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jie-Ying Lee</a>, <a href="/search/cs?searchtype=author&query=Ke%2C+B">Bo-Hsu Ke</a>, <a href="/search/cs?searchtype=author&query=Mu%2C+C+T">Chun-Wei Tuan Mu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yi-Chuan Huang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Chin-Yang Lin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Min-Hung Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yen-Yu Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yu-Lun Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.05176v1-abstract-full"> Three-dimensional scene inpainting is crucial for applications from virtual reality to architectural visualization, yet existing methods struggle with view consistency and geometric accuracy in 360° unbounded scenes. We present AuraFusion360, a novel reference-based method that enables high-quality object removal and hole filling in 3D scenes represented by Gaussian Splatting. Our approach introduces (1) depth-aware unseen mask generation for accurate occlusion identification, (2) Adaptive Guided Depth Diffusion, a zero-shot method for accurate initial point placement without requiring additional training, and (3) SDEdit-based detail enhancement for multi-view coherence. We also introduce 360-USID, the first comprehensive dataset for 360° unbounded scene inpainting with ground truth. Extensive experiments demonstrate that AuraFusion360 significantly outperforms existing methods, achieving superior perceptual quality while maintaining geometric accuracy across dramatic viewpoint changes. See our project page for video results and the dataset at https://kkennethwu.github.io/aurafusion360/. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://kkennethwu.github.io/aurafusion360/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04392">arXiv:2502.04392</a> <span> [<a href="https://arxiv.org/pdf/2502.04392">pdf</a>, <a href="https://arxiv.org/format/2502.04392">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Division-of-Thoughts: Harnessing Hybrid Language Model Synergy for Efficient On-Device Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shao%2C+C">Chenyang Shao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xinyuan Hu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yutang Lin</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+F">Fengli Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.04392v1-abstract-full"> The rapid expansion of web content has made on-device AI assistants indispensable for helping users manage the increasing complexity of online tasks. The emergent reasoning ability of large language models offers a promising path for next-generation on-device AI agents. However, deploying full-scale Large Language Models (LLMs) on resource-limited local devices is challenging. In this paper, we propose Division-of-Thoughts (DoT), a collaborative reasoning framework leveraging the synergy between locally deployed Smaller-scale Language Models (SLMs) and cloud-based LLMs. DoT leverages a Task Decomposer to elicit the inherent planning abilities in language models to decompose user queries into smaller sub-tasks, which allows hybrid language models to fully exploit their respective strengths. In addition, DoT employs a Task Scheduler to analyze the pair-wise dependency of sub-tasks and create a dependency graph, facilitating parallel reasoning of sub-tasks and the identification of key steps. To allocate the appropriate model based on the difficulty of sub-tasks, DoT leverages a Plug-and-Play Adapter, which is an additional task head attached to the SLM that does not alter the SLM's parameters.
To boost the adapter's task allocation capability, we propose a self-reinforced training method that relies solely on task execution feedback. Extensive experiments on various benchmarks demonstrate that our DoT significantly reduces LLM costs while maintaining competitive reasoning accuracy. Specifically, DoT reduces the average reasoning time and API costs by 66.12% and 83.57%, respectively, while achieving reasoning accuracy comparable to the best baseline methods. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
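<p>A schematic of the DoT-style scheduling loop described above, under assumptions of ours: <code>slm</code> and <code>llm</code> are placeholder callables, <code>difficulty</code> stands in for the adapter's score, and the threshold rule is illustrative.</p>
<pre><code>import networkx as nx

def run_plan(subtasks, deps, difficulty, slm, llm, threshold=0.5):
    """Execute sub-tasks over a dependency DAG, routing each to SLM or LLM.

    deps is a list of (before, after) edges between sub-task names; results of
    prerequisite sub-tasks are passed along as context.
    """
    g = nx.DiGraph(deps)
    g.add_nodes_from(subtasks)
    results = {}
    for task in nx.topological_sort(g):        # respects pair-wise dependencies
        context = {d: results[d] for d in g.predecessors(task)}
        # easy sub-tasks stay on-device; hard ones go to the cloud model
        model = llm if difficulty(task) >= threshold else slm
        results[task] = model(task, context)
    return results
</code></pre>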
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03937">arXiv:2502.03937</a> <span> [<a href="https://arxiv.org/pdf/2502.03937">pdf</a>, <a href="https://arxiv.org/format/2502.03937">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> </div> </div> <p class="title is-5 mathjax"> Quantifying Correlations of Machine Learning Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yuanyuan Li</a>, <a href="/search/cs?searchtype=author&query=Sarna%2C+N">Neeraj Sarna</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yang Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.03937v1-abstract-full"> Machine Learning models are being extensively used in safety-critical applications where errors from these models could cause harm to the user. Such risks are amplified when multiple machine learning models, which are deployed concurrently, interact and make errors simultaneously. This paper explores three scenarios where error correlations between multiple models arise, resulting in such aggregated risks. Using real-world data, we simulate these scenarios and quantify the correlations in errors of different models. Our findings indicate that aggregated risks are substantial, particularly when models share similar algorithms, training datasets, or foundational models. Overall, we observe that correlations across models are pervasive and likely to intensify with increased reliance on foundational models and widely used public datasets, highlighting the need for effective mitigation strategies to address these challenges. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
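<p>Error correlation between two deployed models can be quantified directly from their per-input error indicators; a minimal numpy sketch of the general idea, not the paper's specific estimator.</p>
<pre><code>import numpy as np

def error_correlation(y_true, pred_a, pred_b):
    """Pearson correlation between the two models' 0/1 error indicators.

    A high value means the models tend to be wrong on the same inputs,
    so their risks aggregate instead of cancelling out.
    """
    err_a = (pred_a != y_true).astype(float)
    err_b = (pred_b != y_true).astype(float)
    return np.corrcoef(err_a, err_b)[0, 1]

# Example: two models that share most of their mistakes.
y = np.array([0, 1, 1, 0, 1, 0])
a = np.array([0, 0, 1, 0, 0, 1])
b = np.array([0, 0, 1, 0, 0, 0])
print(error_correlation(y, a, b))  # about 0.71: strongly correlated errors
</code></pre>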
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03806">arXiv:2502.03806</a> <span> [<a href="https://arxiv.org/pdf/2502.03806">pdf</a>, <a href="https://arxiv.org/format/2502.03806">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Should Code Models Learn Pedagogically? A Preliminary Evaluation of Curriculum Learning for Real-World Software Engineering Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Khant%2C+K+S">Kyi Shin Khant</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H+Y">Hong Yi Lin</a>, <a href="/search/cs?searchtype=author&query=Thongtanunam%2C+P">Patanamon Thongtanunam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.03806v1-abstract-full"> Learning-based techniques, especially advanced pre-trained models for code, have demonstrated capabilities in code understanding and generation, solving diverse software engineering (SE) tasks. Despite the promising results, current training approaches may not fully optimize model performance, as they typically involve learning from randomly shuffled training data. Recent work shows that Curriculum Learning (CL) can improve performance on code-related tasks through incremental learning based on the difficulty of synthetic code. Yet, the effectiveness of CL with conventional difficulty measures in SE tasks remains largely unexplored. In this study, we explore two conventional code metrics, code length and cyclomatic complexity, to determine difficulty levels. We investigate how the pre-trained code model (CodeT5) learns under CL through the tasks of code clone detection and code summarization. Our empirical study on the CodeXGLUE benchmark showed results that contrast with prior studies: the model exhibited signs of catastrophic forgetting and shortcut learning. Surprisingly, model performance saturates after only the first quartile of training, potentially indicating a limit in the model's representation capacity and/or the task's inherent difficulty. Future work should further explore various CL strategies with different code models across a wider range of SE tasks for a more holistic understanding. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by the 22nd International Conference on Mining Software Repositories (MSR 25)</span> </p> </li>
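<p>The two conventional difficulty measures are easy to operationalize as a curriculum sort key; a small sketch, using a crude keyword count as a stand-in for true cyclomatic complexity.</p>
<pre><code>import re

def difficulty(code):
    """Sort key for curriculum ordering: (code length, complexity proxy).

    The proxy counts branching constructs plus one, a rough stand-in for
    cyclomatic complexity; a real pipeline would use a proper analyzer.
    """
    branches = len(re.findall(r"\b(if|elif|for|while|except|and|or)\b", code))
    return (len(code.splitlines()), 1 + branches)

samples = [
    "def g(x):\n    if x:\n        return 1\n    return 0",
    "def f():\n    return 1",
]
curriculum = sorted(samples, key=difficulty)  # easy-to-hard training order
</code></pre>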
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03772">arXiv:2502.03772</a> <span> [<a href="https://arxiv.org/pdf/2502.03772">pdf</a>, <a href="https://arxiv.org/format/2502.03772">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Retrospective Systematic Study on Hierarchical Sparse Query Transformer-assisted Ultrasound Screening for Early Hepatocellular Carcinoma </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=She%2C+C">Chaoyin She</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+R">Ruifang Lu</a>, <a href="/search/cs?searchtype=author&query=He%2C+D">Danni He</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+J">Jiayi Lv</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yadan Lin</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+M">Meiqing Cheng</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hui Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lida Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qinghua Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.03772v1-abstract-full"> Hepatocellular carcinoma (HCC) ranks as the third leading cause of cancer-related mortality worldwide, with early detection being crucial for improving patient survival rates. However, early screening for HCC using ultrasound suffers from insufficient sensitivity and is highly dependent on the expertise of radiologists for interpretation. Leveraging the latest advancements in artificial intelligence (AI) in medical imaging, this study proposes an innovative Hierarchical Sparse Query Transformer (HSQformer) model that combines the strengths of Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) to enhance the accuracy of HCC diagnosis in ultrasound screening. The HSQformer leverages sparse latent space representations to capture hierarchical details at various granularities without the need for complex adjustments, and adopts a modular, plug-and-play design philosophy, ensuring the model's versatility and ease of use. The HSQformer's performance was rigorously tested across three distinct clinical scenarios: single-center, multi-center, and high-risk patient testing. In each of these settings, it consistently outperformed existing state-of-the-art models, such as ConvNext and SwinTransformer. Notably, the HSQformer even matched the diagnostic capabilities of senior radiologists and comprehensively surpassed those of junior radiologists. The experimental results from this study strongly demonstrate the effectiveness and clinical potential of AI-assisted tools in HCC screening. The full code is available at https://github.com/Asunatan/HSQformer. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03493">arXiv:2502.03493</a> <span> [<a href="https://arxiv.org/pdf/2502.03493">pdf</a>, <a href="https://arxiv.org/format/2502.03493">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MetaFE-DE: Learning Meta Feature Embedding for Depth Estimation from Monocular Endoscopic Images </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+D">Dawei Lu</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+D">Deqiang Xiao</a>, <a href="/search/cs?searchtype=author&query=Ai%2C+D">Danni Ai</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+J">Jingfan Fan</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+T">Tianyu Fu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yucong Lin</a>, <a href="/search/cs?searchtype=author&query=Song%2C+H">Hong Song</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+X">Xujiong Ye</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jian Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.03493v1-abstract-full"> Depth estimation from monocular endoscopic images presents significant challenges due to the complexity of endoscopic surgery, such as irregular shapes of human soft tissues, as well as variations in lighting conditions. Existing methods primarily estimate depth information from RGB images directly and often suffer from limited interpretability and accuracy. Given that RGB and depth images are two views of the same endoscopic surgery scene, in this paper we introduce a novel concept referred to as "meta feature embedding (MetaFE)", in which the physical entities (e.g., tissues and surgical instruments) of endoscopic surgery are represented using shared features that can be alternatively decoded into RGB or depth images. With this concept, we propose a two-stage self-supervised learning paradigm for monocular endoscopic depth estimation. In the first stage, we propose a temporal representation learner using diffusion models, which are aligned with the spatial information through cross normalization to construct the MetaFE.
In the second stage, self-supervised monocular depth estimation with brightness calibration is applied to decode the meta features into the depth image. Extensive evaluation on diverse endoscopic datasets demonstrates that our approach outperforms state-of-the-art methods in depth estimation, achieving superior accuracy and generalization. The source code will be publicly available. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.02757">arXiv:2502.02757</a> <span> [<a href="https://arxiv.org/pdf/2502.02757">pdf</a>, <a href="https://arxiv.org/format/2502.02757">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Too Noisy To Learn: Enhancing Data Quality for Code Review Comment Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chunhua Liu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+H+Y">Hong Yi Lin</a>, <a href="/search/cs?searchtype=author&query=Thongtanunam%2C+P">Patanamon Thongtanunam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.02757v2-abstract-full"> Code review is an important practice in software development, yet it is time-consuming and requires substantial effort. While open-source datasets have been used to train neural models for automating code review tasks, including review comment generation, these datasets contain a significant amount of noisy comments (e.g., vague or non-actionable feedback) that persist despite cleaning methods using heuristics and machine learning approaches. Such remaining noise may lead models to generate low-quality review comments, yet removing it requires a complex semantic understanding of both code changes and natural language comments. In this paper, we investigate the impact of such noise on review comment generation and propose a novel approach using large language models (LLMs) to further clean these datasets.
Based on an empirical study on a large-scale code review dataset, our LLM-based approach achieves 66-85% precision in detecting valid comments. Fine-tuning state-of-the-art code review models on the predicted valid comments (the cleaned models) generates review comments that are 13.0% - 12.4% more similar to valid human-written comments than those of the original models. We also find that the cleaned models can generate more informative and relevant comments than the original models. Our findings underscore the critical impact of dataset quality on the performance of review comment generation. We advocate for further research into cleaning training data to enhance the practical utility and quality of automated code review. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The paper is published at the International Conference on Mining Software Repositories (MSR2025)</span> </p> </li>
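<p>A schematic of an LLM-based validity filter of this kind, where <code>ask_llm</code> is a placeholder for any chat-completion call and the prompt wording is our own illustration, not the paper's.</p>
<pre><code>VALIDITY_PROMPT = (
    "You review training data for code review comment generation.\n"
    "Given a code change and a reviewer comment, answer YES if the comment\n"
    "is specific and actionable for this change, otherwise NO.\n\n"
    "Diff:\n{diff}\n\nComment:\n{comment}\n"
)

def keep_valid(pairs, ask_llm):
    """Filter (diff, comment) pairs, keeping only LLM-validated comments."""
    cleaned = []
    for diff, comment in pairs:
        answer = ask_llm(VALIDITY_PROMPT.format(diff=diff, comment=comment))
        if answer.strip().upper().startswith("YES"):
            cleaned.append((diff, comment))
    return cleaned  # fine-tune the comment generation model on these pairs
</code></pre>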
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01776">arXiv:2502.01776</a> <span> [<a href="https://arxiv.org/pdf/2502.01776">pdf</a>, <a href="https://arxiv.org/format/2502.01776">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Sparse VideoGen: Accelerating Video Diffusion Transformers with Spatial-Temporal Sparsity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xi%2C+H">Haocheng Xi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shuo Yang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yilong Zhao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chenfeng Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Muyang Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiuyu Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yujun Lin</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+H">Han Cai</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jintao Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Dacheng Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jianfei Chen</a>, <a href="/search/cs?searchtype=author&query=Stoica%2C+I">Ion Stoica</a>, <a href="/search/cs?searchtype=author&query=Keutzer%2C+K">Kurt Keutzer</a>, <a href="/search/cs?searchtype=author&query=Han%2C+S">Song Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.01776v1-abstract-full"> Diffusion Transformers (DiTs) dominate video generation, but their high computational cost severely limits real-world applicability, usually requiring tens of minutes to generate a few seconds of video even on high-performance GPUs. This inefficiency primarily arises from the quadratic computational complexity of 3D Full Attention with respect to the context length. In this paper, we propose a training-free framework termed Sparse VideoGen (SVG) that leverages the inherent sparsity in 3D Full Attention to boost inference efficiency. We reveal that the attention heads can be dynamically classified into two groups depending on distinct sparse patterns: (1) Spatial Head, where only spatially-related tokens within each frame dominate the attention output, and (2) Temporal Head, where only temporally-related tokens across different frames dominate. Based on this insight, SVG proposes an online profiling strategy to capture the dynamic sparse patterns and predicts the type of attention head. Combined with a novel hardware-efficient tensor layout transformation and customized kernel implementations, SVG achieves up to 2.28x and 2.33x end-to-end speedup on CogVideoX-v1.5 and HunyuanVideo, respectively, while preserving generation quality. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 8 figures, 3 tables</span> </p> </li>
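<p>The spatial/temporal head split can be pictured as asking where a head's attention mass concentrates; a hedged PyTorch sketch of that classification rule (the paper's online profiling strategy is more involved).</p>
<pre><code>import torch

def classify_head(attn, frames, tokens_per_frame):
    """Classify one attention head as 'spatial' or 'temporal' (illustrative).

    attn: (L, L) averaged attention map, with L = frames * tokens_per_frame.
    A spatial head concentrates mass inside each frame's diagonal block; a
    temporal head concentrates mass on same-position tokens across frames.
    """
    L = frames * tokens_per_frame
    frame_id = torch.arange(L) // tokens_per_frame
    pos_id = torch.arange(L) % tokens_per_frame
    same_frame = frame_id[:, None] == frame_id[None, :]
    same_pos = pos_id[:, None] == pos_id[None, :]
    spatial_mass = attn[same_frame].sum()
    temporal_mass = attn[torch.logical_and(same_pos, ~same_frame)].sum()
    return "spatial" if spatial_mass >= temporal_mass else "temporal"
</code></pre>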
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01587">arXiv:2502.01587</a> <span> [<a href="https://arxiv.org/pdf/2502.01587">pdf</a>, <a href="https://arxiv.org/format/2502.01587">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Verbalized Bayesian Persuasion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenhao Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yue Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xiangfeng Wang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+B">Bo Jin</a>, <a href="/search/cs?searchtype=author&query=Zha%2C+H">Hongyuan Zha</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Baoxiang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.01587v1-abstract-full"> Information design (ID) explores how a sender influences the optimal behavior of receivers to achieve specific objectives. While ID originates from everyday human communication, existing game-theoretic and machine learning methods often model information structures as numbers, which limits many applications to toy games. This work leverages LLMs and proposes a verbalized framework in Bayesian persuasion (BP), which extends classic BP to real-world games involving human dialogues for the first time. Specifically, we map BP to a verbalized mediator-augmented extensive-form game, where LLMs instantiate the sender and receiver. To efficiently solve the verbalized game, we propose a generalized equilibrium-finding algorithm combining an LLM with a game solver. The algorithm is reinforced with techniques including verbalized commitment assumptions, verbalized obedience constraints, and information obfuscation. Numerical experiments in dialogue scenarios, such as recommendation letters, courtroom interactions, and law enforcement, validate that our framework can both reproduce theoretical results in classic BP and discover effective persuasion strategies in more complex natural language and multi-stage scenarios. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">63 pages, 21 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01458">arXiv:2502.01458</a> <span> [<a href="https://arxiv.org/pdf/2502.01458">pdf</a>, <a href="https://arxiv.org/format/2502.01458">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Understanding the Capabilities and Limitations of Weak-to-Strong Generalization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yao%2C+W">Wei Yao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wenkai Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziqiao Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yankai Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.01458v1-abstract-full"> Weak-to-strong generalization, where weakly supervised strong models outperform their weaker teachers, offers a promising approach to aligning superhuman models with human values. To deepen the understanding of this approach, we provide theoretical insights into its capabilities and limitations. First, in the classification setting, we establish upper and lower generalization error bounds for the strong model, identifying the primary limitations as stemming from the weak model's generalization error and the optimization objective itself.
Additionally, we derive lower and upper bounds on the calibration error of the strong model. These theoretical bounds reveal two critical insights: (1) the weak model should demonstrate strong generalization performance and maintain well-calibrated predictions, and (2) the strong model's training process must strike a careful balance, as excessive optimization could undermine its generalization capability by over-relying on the weak supervision signals. Finally, in the regression setting, we extend the work of Charikar et al. (2024) to a loss function based on Kullback-Leibler (KL) divergence, offering guarantees that the strong student can outperform its weak teacher by at least the magnitude of their disagreement. We conduct extensive experiments to validate our theory. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li>
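<p>Schematically, the regression-setting guarantee quoted above has the following shape; this is our paraphrase in MathJax, not the paper's exact statement, with $\mathrm{err}(\cdot)$ denoting population risk under the KL-based loss:</p> <p class="mathjax">$$\mathrm{err}(f_{\text{weak}}) \;-\; \mathrm{err}(f_{\text{strong}}) \;\ge\; \mathbb{E}_{x}\!\left[\mathrm{KL}\!\big(f_{\text{strong}}(x)\,\big\|\,f_{\text{weak}}(x)\big)\right],$$ i.e., the strong student beats its weak teacher by at least their average disagreement.</p>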
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01107">arXiv:2502.01107</a> <span> [<a href="https://arxiv.org/pdf/2502.01107">pdf</a>, <a href="https://arxiv.org/format/2502.01107">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GTG: Generalizable Trajectory Generation Model for Urban Mobility </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jingyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yujing Lin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yudong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.01107v1-abstract-full"> Trajectory data mining is crucial for smart city management. However, collecting large-scale trajectory datasets is challenging due to factors such as commercial conflicts and privacy regulations. Therefore, we urgently need trajectory generation techniques to address this issue. Existing trajectory generation methods rely on the global road network structure of cities; when the road network structure changes, these methods are often not transferable to other cities. In fact, invariant mobility patterns exist across cities: 1) people prefer paths with minimal travel cost; 2) the travel cost of roads has an invariant relationship with the topological features of the road network. Based on this insight, this paper proposes a Generalizable Trajectory Generation model (GTG). The model consists of three parts: 1) extracting city-invariant road representations based on the Space Syntax method; 2) cross-city travel cost prediction through disentangled adversarial training; and 3) travel preference learning by shortest path search and preference updates. By learning invariant movement patterns, the model is capable of generating trajectories in new cities. Experiments on three datasets demonstrate that our model significantly outperforms existing models in terms of generalization ability. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 5 figures</span> </p> </li>
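<p>Once per-road travel costs have been predicted for a new city, the generation step amounts to minimal-cost path search; a small networkx sketch with illustrative names (not the authors' implementation).</p>
<pre><code>import networkx as nx

def generate_trajectory(road_graph, origin, destination, cost):
    """Generate one trajectory as the minimal-cost path under learned costs.

    cost(u, v) stands in for the cross-city travel cost model's prediction
    for the road segment between intersections u and v.
    """
    for u, v in road_graph.edges:
        road_graph[u][v]["w"] = cost(u, v)   # attach predicted costs
    return nx.shortest_path(road_graph, origin, destination, weight="w")
</code></pre>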
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01100">arXiv:2502.01100</a> <span> [<a href="https://arxiv.org/pdf/2502.01100">pdf</a>, <a href="https://arxiv.org/format/2502.01100">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> ZebraLogic: On the Scaling Limits of LLMs for Logical Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+B+Y">Bill Yuchen Lin</a>, <a href="/search/cs?searchtype=author&query=Bras%2C+R+L">Ronan Le Bras</a>, <a href="/search/cs?searchtype=author&query=Richardson%2C+K">Kyle Richardson</a>, <a href="/search/cs?searchtype=author&query=Sabharwal%2C+A">Ashish Sabharwal</a>, <a href="/search/cs?searchtype=author&query=Poovendran%2C+R">Radha Poovendran</a>, <a href="/search/cs?searchtype=author&query=Clark%2C+P">Peter Clark</a>, <a href="/search/cs?searchtype=author&query=Choi%2C+Y">Yejin Choi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2502.01100v1-abstract-full"> We investigate the logical reasoning capabilities of large language models (LLMs) and their scalability in complex non-monotonic reasoning. To this end, we introduce ZebraLogic, a comprehensive evaluation framework for assessing LLM reasoning performance on logic grid puzzles derived from constraint satisfaction problems (CSPs). ZebraLogic enables the generation of puzzles with controllable and quantifiable complexity, facilitating a systematic study of the scaling limits of models such as Llama, o1 models, and DeepSeek-R1. By encompassing a broad range of search space complexities and diverse logical constraints, ZebraLogic provides a structured environment to evaluate reasoning under increasing difficulty. Our results reveal a significant decline in accuracy as problem complexity grows, a phenomenon we term the curse of complexity. This limitation persists even with larger models and increased inference-time computation, suggesting inherent constraints in current LLM reasoning capabilities. Additionally, we explore strategies to enhance logical reasoning, including Best-of-N sampling, backtracking mechanisms, and self-verification prompts. Our findings offer critical insights into the scalability of LLM reasoning, highlight fundamental limitations, and outline potential directions for improvement. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025.
arXiv:2502.00843 [pdf, other]
Subjects: cs.CV
Title: VLM-Assisted Continual learning for Visual Question Answering in Self-Driving
Authors: Yuxin Lin, Mengshi Qi, Liang Liu, Huadong Ma
Abstract: In this paper, we propose a novel approach for solving the Visual Question Answering (VQA) task in autonomous driving by integrating Vision-Language Models (VLMs) with continual learning. In autonomous driving, VQA plays a vital role in enabling the system to understand and reason about its surroundings. However, traditional models often struggle with catastrophic forgetting when sequentially exposed to new driving tasks, such as perception, prediction, and planning, each requiring different forms of knowledge. To address this challenge, we present a novel continual learning framework that combines VLMs with selective memory replay and knowledge distillation, reinforced by task-specific projection layer regularization. The knowledge distillation allows a previously trained model to act as a "teacher" to guide the model through subsequent tasks, minimizing forgetting. Meanwhile, task-specific projection layers calculate the loss based on the divergence of feature representations, ensuring continuity in learning and reducing the shift between tasks. Evaluated on the DriveLM dataset, our framework shows substantial performance improvements, with gains ranging from 21.40% to 32.28% across various metrics. These results highlight the effectiveness of combining continual learning with VLMs in enhancing the resilience and reliability of VQA systems in autonomous driving. We will release our source code.
Submitted 2 February, 2025; originally announced February 2025.
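The distillation term described above is, in its standard form, a temperature-softened KL divergence between the frozen teacher (trained on earlier driving tasks) and the student. A minimal sketch with assumed temperature and scaling, not the authors' exact loss:

    import torch.nn.functional as F

    def distillation_loss(student_logits, teacher_logits, T: float = 2.0):
        # Soften both distributions with temperature T.
        p_teacher = F.softmax(teacher_logits / T, dim=-1)
        log_p_student = F.log_softmax(student_logits / T, dim=-1)
        # KL(teacher || student), scaled by T^2 as is conventional
        # in knowledge distillation.
        return F.kl_div(log_p_student, p_teacher, reduction="batchmean") * T * T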
arXiv:2502.00792 [pdf, other]
Subjects: cs.AI
Title: RTBAgent: A LLM-based Agent System for Real-Time Bidding
Authors: Leng Cai, Junxuan He, Yikai Li, Junjie Liang, Yuanping Lin, Ziming Quan, Yawen Zeng, Jin Xu
Abstract: Real-Time Bidding (RTB) enables advertisers to place competitive bids on impression opportunities instantaneously, striving for cost-effectiveness in a highly competitive landscape. Although RTB has widely benefited from technologies such as deep learning and reinforcement learning, the reliability of related methods often suffers from discrepancies between online and offline environments and from the rapid fluctuations of online bidding. To handle these challenges, we propose RTBAgent, the first RTB agent system based on large language models (LLMs), which synchronizes real competitive advertising bidding environments and obtains bidding prices through an integrated decision-making process. Specifically, building on the reasoning ability of LLMs, RTBAgent is further tailored to RTB through auxiliary modules: a click-through rate estimation model, expert strategy knowledge, and daily reflection. In addition, we propose a two-step decision-making process and a multi-memory retrieval mechanism, which enable RTBAgent to review historical decisions and transaction records and make decisions that adapt to market changes in real-time bidding. Empirical testing with real advertising datasets demonstrates that RTBAgent significantly enhances profitability. The RTBAgent code will be publicly accessible at: https://github.com/CaiLeng/RTBAgent.
Submitted 2 February, 2025; originally announced February 2025.
Comments: Accepted by WWW 2025
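A rough sketch of how the two-step decision process and auxiliary modules could fit together; every component name here (ctr_model, llm, memories) is a hypothetical placeholder, not the released interface:

    def decide_bid(impression, ctr_model, llm, memories):
        # Step 1: ground the agent with the auxiliary modules.
        ctr = ctr_model.predict(impression)          # click-through rate estimate
        context = memories.retrieve(impression)      # multi-memory retrieval
        # Step 2: let the LLM reason over the estimate, expert strategies,
        # and retrieved history to set the final bid price.
        return llm(
            f"CTR estimate: {ctr:.4f}\n"
            f"Relevant past decisions and transactions: {context}\n"
            "Propose a bid price and justify it briefly."
        )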
arXiv:2502.00674 [pdf, other]
Subjects: cs.CL; cs.LG
Title: Rethinking Mixture-of-Agents: Is Mixing Different Large Language Models Beneficial?
Authors: Wenzhe Li, Yong Lin, Mengzhou Xia, Chi Jin
Abstract: Ensembling outputs from diverse sources is a straightforward yet effective approach to boost performance. Mixture-of-Agents (MoA) is one such popular ensemble method that aggregates outputs from multiple different Large Language Models (LLMs). This paper raises the question in the context of language models: is mixing different LLMs truly beneficial? We propose Self-MoA, an ensemble method that aggregates outputs from only the single top-performing LLM. Our extensive experiments reveal that, surprisingly, Self-MoA outperforms standard MoA that mixes different LLMs in a large number of scenarios: Self-MoA achieves a 6.6% improvement over MoA on the AlpacaEval 2.0 benchmark, and an average 3.8% improvement across various benchmarks, including MMLU, CRUX, and MATH. Applying Self-MoA to one of the top-ranking models in AlpacaEval 2.0 directly achieves new state-of-the-art performance on the leaderboard. To understand the effectiveness of Self-MoA, we systematically investigate the trade-off between diversity and quality of outputs under various MoA settings. We confirm that MoA performance is rather sensitive to output quality, and that mixing different LLMs often lowers the average quality of the models. To complement the study, we identify the scenarios where mixing different LLMs can be helpful. The paper further introduces a sequential version of Self-MoA that is capable of aggregating a large number of LLM outputs on the fly over multiple rounds, and is as effective as aggregating all outputs at once.
Submitted 2 February, 2025; originally announced February 2025.
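The Self-MoA idea can be sketched in a few lines: sample several outputs from one model, then have the same model synthesize them, mirroring MoA's aggregation step without mixing different LLMs. Here generate is a hypothetical single-model sampling function, not the authors' code:

    def self_moa(prompt, generate, n_samples: int = 6):
        # Draw several samples from the single top-performing model.
        proposals = [generate(prompt, temperature=0.7) for _ in range(n_samples)]
        # Ask the same model to aggregate its own candidates into one answer.
        numbered = "\n\n".join(f"[{i + 1}] {p}" for i, p in enumerate(proposals))
        return generate(
            f"Synthesize the best single response to the task '{prompt}' "
            f"from these candidates:\n\n{numbered}"
        )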
arXiv:2502.00219 [pdf]
Subjects: cs.SI
Title: Team Size and Its Negative Impact on the Disruption Index
Authors: Yiling Lin, Linzhuo Li, Lingfei Wu
Abstract: As science transitions from the age of lone geniuses to an era of collaborative teams, the question of whether large teams can sustain the creativity of individuals and continue driving innovation has become increasingly important. Our previous research first revealed a negative relationship between team size and the Disruption Index, a network-based metric of innovation, by analyzing 65 million projects across papers, patents, and software over half a century. This work has sparked lively debates within the scientific community about the robustness of the Disruption Index in capturing the impact of team size on innovation. Here, we present additional evidence that the negative link between team size and disruption holds, even when accounting for factors such as reference length, citation impact, and historical time. We further show how a narrow 5-year window for measuring disruption can misrepresent this relationship as positive, underestimating the long-term disruptive potential of small teams. Like "sleeping beauties," small teams need a decade or more to see their transformative contributions to science recognized.
Submitted 31 January, 2025; originally announced February 2025.
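For context, the Disruption Index mentioned above is, in its standard form (Wu, Wang & Evans, 2019), computed from the citation network around a focal paper. A minimal sketch of that standard definition, not the paper's analysis code:

    def disruption_index(citers_of_focal: set, citers_of_refs: set) -> float:
        n_i = len(citers_of_focal - citers_of_refs)  # cite focal only (disrupting)
        n_j = len(citers_of_focal & citers_of_refs)  # cite focal and its references
        n_k = len(citers_of_refs - citers_of_focal)  # cite the references only
        total = n_i + n_j + n_k
        # Ranges from -1 (purely consolidating) to +1 (purely disrupting).
        return (n_i - n_j) / total if total else 0.0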
arXiv:2501.18982 [pdf, other]
Subjects: cs.CV
Title: OmniPhysGS: 3D Constitutive Gaussians for General Physics-Based Dynamics Generation
Authors: Yuchen Lin, Chenguo Lin, Jianjin Xu, Yadong Mu
Abstract: Recently, significant advancements have been made in the reconstruction and generation of 3D assets, including static cases and those with physical interactions. To recover the physical properties of 3D assets, existing methods typically assume that all materials belong to a specific predefined category (e.g., elasticity). However, such assumptions ignore the complex composition of multiple heterogeneous objects in real scenarios and tend to render less physically plausible animation given a wider range of objects. We propose OmniPhysGS for synthesizing a physics-based 3D dynamic scene composed of more general objects. A key design of OmniPhysGS is treating each 3D asset as a collection of constitutive 3D Gaussians. For each Gaussian, its physical material is represented by an ensemble of 12 physical domain-expert sub-models (rubber, metal, honey, water, etc.), which greatly enhances the flexibility of the proposed model. In the implementation, we define a scene by user-specified prompts and supervise the estimation of material weighting factors via a pretrained video diffusion model. Comprehensive experiments demonstrate that OmniPhysGS achieves more general and realistic physical dynamics across a broader spectrum of materials, including elastic, viscoelastic, plastic, and fluid substances, as well as interactions between different materials. Our method surpasses existing methods by approximately 3% to 16% in metrics of visual quality and text alignment.
Submitted 31 January, 2025; originally announced January 2025.
Comments: Accepted to ICLR 2025; Project page: https://wgsxm.github.io/projects/omniphysgs/
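The per-Gaussian material ensemble can be pictured as a learned convex combination of expert responses. A hedged sketch under stated assumptions: the expert sub-models and the softmax weighting are illustrative stand-ins, not the released implementation:

    import torch

    def blended_material_response(deformation, experts, logits):
        # Each domain-expert sub-model (rubber, metal, honey, water, ...)
        # maps a deformation state to a stress response; the weighting
        # logits are learned per Gaussian.
        weights = torch.softmax(logits, dim=-1)                    # (n_experts,)
        responses = torch.stack([f(deformation) for f in experts]) # (n_experts, ...)
        shape = (-1,) + (1,) * (responses.dim() - 1)
        return (weights.view(shape) * responses).sum(dim=0)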
arXiv:2501.18427 [pdf, other]
Subjects: cs.CV
Title: SANA 1.5: Efficient Scaling of Training-Time and Inference-Time Compute in Linear Diffusion Transformer
Authors: Enze Xie, Junsong Chen, Yuyang Zhao, Jincheng Yu, Ligeng Zhu, Yujun Lin, Zhekai Zhang, Muyang Li, Junyu Chen, Han Cai, Bingchen Liu, Daquan Zhou, Song Han
Abstract: This paper presents SANA-1.5, a linear Diffusion Transformer for efficient scaling in text-to-image generation. Building upon SANA-1.0, we introduce three key innovations: (1) Efficient Training Scaling: a depth-growth paradigm that enables scaling from 1.6B to 4.8B parameters with significantly reduced computational resources, combined with a memory-efficient 8-bit optimizer. (2) Model Depth Pruning: a block importance analysis technique for efficient model compression to arbitrary sizes with minimal quality loss. (3) Inference-Time Scaling: a repeated sampling strategy that trades computation for model capacity, enabling smaller models to match larger-model quality at inference time. Through these strategies, SANA-1.5 achieves a text-image alignment score of 0.72 on GenEval, which can be further improved to 0.80 through inference scaling, establishing a new state of the art on the GenEval benchmark. These innovations enable efficient model scaling across different compute budgets while maintaining high quality, making high-quality image generation more accessible.
Submitted 5 February, 2025; v1 submitted 30 January, 2025; originally announced January 2025.
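The repeated-sampling strategy in point (3) is, at its core, a best-of-N selection: spend extra inference compute on candidates and keep the one a scorer prefers. A sketch with a hypothetical model.sample and scorer, not SANA's actual interface:

    def best_of_n(prompt, model, scorer, n: int = 8):
        # Trade extra compute for quality: a smaller model with N samples
        # can approach larger-model quality under a good selector.
        candidates = [model.sample(prompt) for _ in range(n)]
        return max(candidates, key=lambda img: scorer(prompt, img))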
arXiv:2501.17790 [pdf, other]
Subjects: cs.CL; cs.AI
Title: BreezyVoice: Adapting TTS for Taiwanese Mandarin with Enhanced Polyphone Disambiguation -- Challenges and Insights
Authors: Chan-Jan Hsu, Yi-Cheng Lin, Chia-Chun Lin, Wei-Chih Chen, Ho Lam Chung, Chen-An Li, Yi-Chang Chen, Chien-Yu Yu, Ming-Ji Lee, Chien-Cheng Chen, Ru-Heng Huang, Hung-yi Lee, Da-Shan Shiu
Abstract: We present BreezyVoice, a Text-to-Speech (TTS) system specifically adapted for Taiwanese Mandarin, highlighting phonetic control abilities to address the unique challenges of polyphone disambiguation in the language. Building upon CosyVoice, we incorporate an $S^{3}$ tokenizer, a large language model (LLM), an optimal-transport conditional flow matching model (OT-CFM), and a grapheme-to-phoneme prediction model to generate realistic speech that closely mimics human utterances. Our evaluation demonstrates BreezyVoice's superior performance in both general and code-switching contexts, highlighting its robustness and effectiveness in generating high-fidelity speech. Additionally, we address the challenges of generalizability in modeling long-tail speakers and polyphone disambiguation. Our approach significantly enhances performance and offers valuable insights into the workings of neural codec TTS systems.
Submitted 29 January, 2025; originally announced January 2025.
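Read as a pipeline, the listed components chain roughly as follows. Every name here is a hypothetical stand-in for the real modules, sketched only to show the data flow the abstract describes:

    def synthesize(text, g2p, llm, ot_cfm):
        phonemes = g2p(text)    # grapheme-to-phoneme, resolving polyphones
        tokens = llm(phonemes)  # LLM predicts discrete S^3 speech tokens
        return ot_cfm(tokens)   # flow matching decodes tokens to audio features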
arXiv:2501.15275 [pdf]
Subjects: physics.app-ph; cond-mat.mes-hall; cs.AR
Title: A Tale of Two Sides of Wafer: Physical Implementation and Block-Level PPA on Flip FET with Dual-sided Signals
Authors: Haoran Lu, Xun Jiang, Yanbang Chu, Ziqiao Xu, Rui Guo, Wanyue Peng, Yibo Lin, Runsheng Wang, Heng Wu, Ru Huang
Abstract: As the conventional scaling of logic devices comes to an end, a functional wafer backside and 3D transistor stacking are the consensus for next-generation logic technology, offering considerable design-space extension for power, signals, or even devices on the wafer backside. The Flip FET (FFET), a novel transistor architecture combining 3D transistor stacking and a fully functional wafer backside, was recently proposed. With a symmetric dual-sided standard cell design, the FFET can deliver around 12.5% cell area scaling and faster but more energy-efficient libraries beyond other stacked transistor technologies such as CFET. Besides, thanks to the novel cell design with dual-sided pins, the FFET supports dual-sided signal routing, delivering better routability and larger backside design space. In this work, we demonstrate a comprehensive FFET evaluation framework considering physical implementation and block-level power-performance-area (PPA) assessment for the first time, whose key functions are dual-sided routing and dual-sided RC extraction. A 32-bit RISC-V core was used for the evaluation. Compared to the CFET with single-sided signals, the FFET with single-sided signals achieved 23.3% post-P&R core area reduction, 25.0% higher frequency and 11.9% lower power at the same utilization, and 16.0% higher frequency at the same core area. Meanwhile, the FFET supports dual-sided signals, which can benefit further from the flexible allocation of cell input pins on both sides. By optimizing the input pin density and the number of BEOL routing layers on each side, a 10.6% frequency gain was realized without power degradation compared to single-sided signal routing. Moreover, the routability and power efficiency of the FFET barely degrade even with the number of routing layers reduced from 12 to 5 on each side, validating the great space for cost-friendly design enabled by the FFET.
Submitted 25 January, 2025; originally announced January 2025.
Comments: Accepted by DATE 2025
Journal ref: Proc. of DATE 2025
arXiv:2501.14960 [pdf, other]
Subjects: cs.LG; cs.AI; cs.CL
Title: LLM4DistReconfig: A Fine-tuned Large Language Model for Power Distribution Network Reconfiguration
Authors: Panayiotis Christou, Md. Zahidul Islam, Yuzhang Lin, Jingwei Xiong
Abstract: Power distribution networks are evolving due to the integration of distributed energy resources (DERs) and increased customer participation. To maintain optimal operation, minimize losses, and meet varying load demands, frequent network reconfiguration is necessary. Traditionally, the reconfiguration task relies on optimization software and expert operators, but as systems grow more complex, faster and more adaptive solutions are required without expert intervention. Data-driven reconfiguration is gaining traction for its accuracy, speed, and robustness against incomplete network data. LLMs, with their ability to capture complex patterns, offer a promising approach for efficient and responsive network reconfiguration in evolving, complex power networks. In this work, we introduce LLM4DistReconfig, a deep learning-based approach utilizing a fine-tuned LLM to solve the distribution network reconfiguration problem. By carefully crafting prompts and designing a custom loss function, we train the LLM with inputs representing network parameters such as buses, available lines, open lines, node voltages, and system loss. The model then predicts optimal reconfigurations by outputting updated network configurations that minimize system loss while meeting operational constraints. Our approach significantly reduces inference time compared to classical algorithms, allowing for near real-time optimal reconfiguration after training. Experimental results show that our method generates optimal configurations that minimize system loss for five individual test datasets and a combined one. It also produces minimal invalid edges and no cycles or isolated subgraphs across all datasets, fulfilling domain-specific needs. Additionally, the generated responses contain less than 5% improper outputs on seen networks and satisfactory results on unseen networks, demonstrating its effectiveness and reliability for the reconfiguration task.
Submitted 8 February, 2025; v1 submitted 24 January, 2025; originally announced January 2025.
Comments: Accepted in NAACL 2025 Conference Main Track
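To illustrate the prompt-crafting step, here is the kind of serialization of network parameters the abstract describes. The field layout and units are assumptions for illustration, not the paper's actual prompt template:

    def build_prompt(buses, available_lines, open_lines, node_voltages, system_loss):
        # Serialize the network state into text for the fine-tuned LLM.
        return (
            "Reconfigure the distribution network to minimize loss.\n"
            f"Buses: {buses}\n"
            f"Available lines: {available_lines}\n"
            f"Open lines: {open_lines}\n"
            f"Node voltages (p.u.): {node_voltages}\n"
            f"Current system loss: {system_loss} kW\n"
            "Return the updated set of open lines; the result must remain "
            "radial (no cycles, no isolated subgraphs)."
        )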
arXiv:2501.14371 [pdf, other]
Subjects: cs.CL; cs.AI; cs.LG
Title: DRESSing Up LLM: Efficient Stylized Question-Answering via Style Subspace Editing
Authors: Xinyu Ma, Yifeng Xu, Yang Lin, Tianlong Wang, Xu Chu, Xin Gao, Junfeng Zhao, Yasha Wang
Abstract: We introduce DRESS, a novel approach for generating stylized large language model (LLM) responses through representation editing. Existing methods like prompting and fine-tuning are either insufficient for complex style adaptation or computationally expensive, particularly in tasks like NPC creation or character role-playing. Our approach leverages the over-parameterized nature of LLMs to disentangle a style-relevant subspace within the model's representation space and conduct representation editing there, ensuring a minimal impact on the original semantics. By applying adaptive editing strengths, we dynamically adjust the steering vectors in the style subspace to maintain both stylistic fidelity and semantic integrity. We develop two stylized QA benchmark datasets to validate the effectiveness of DRESS, and the results demonstrate significant improvements compared to baseline methods such as prompting and ITI. In short, DRESS is a lightweight, training-free solution for enhancing LLMs with flexible and effective style control, making it particularly useful for developing stylized conversational agents. Code and benchmark datasets are available at https://github.com/ArthurLeoM/DRESS-LLM.
Submitted 24 January, 2025; originally announced January 2025.
Comments: ICLR 2025 Accepted
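Representation editing with an adaptive steering strength can be sketched as adding a scaled style direction to a hidden state, steering less when the activation already points along that direction. A minimal sketch assuming a learned style direction v; the adaptive rule here is an illustrative assumption, not the authors' implementation:

    import torch

    def edit_hidden_state(h: torch.Tensor, v: torch.Tensor, base_strength: float = 4.0):
        # h: (batch, d) hidden states; v: (d,) learned style direction.
        v = v / v.norm()
        current = h @ v                                  # projection onto v
        # Steer only up to a target strength, preserving semantics when the
        # state is already stylized.
        alpha = torch.clamp(base_strength - current, min=0.0)
        return h + alpha.unsqueeze(-1) * v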