CINXE.COM
Search | arXiv e-print repository
<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/> <!-- new favicon config and versions by realfavicongenerator.net --> <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b"> <!-- end favicon config --> <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\\(","\\)"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a> <!-- contains Cornell logo and sponsor statement --> <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 758 results for author: <span class="mathjax">Yuan, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Yuan%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Yuan, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yuan%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yuan, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yuan%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yuan%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yuan%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yuan%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yuan%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yuan%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11718">arXiv:2502.11718</a> <span> [<a href="https://arxiv.org/pdf/2502.11718">pdf</a>, <a href="https://arxiv.org/format/2502.11718">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> "See the World, Discover Knowledge": A Chinese Factuality Evaluation for Large Vision Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+J">Jihao Gu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yingyao Wang</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+P">Pi Bu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chen Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziming Wang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+T">Tengtao Song</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+D">Donglai Wei</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiale Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yingxiu Zhao</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yancheng He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shilong Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jiaheng Liu</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+M">Meng Cao</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jun Song</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+Y">Yingshui Tan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&query=Su%2C+W">Wenbo Su</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhicheng Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xiaoyong Zhu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Bo Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11718v1-abstract-short" style="display: inline;"> The evaluation of factual accuracy in large vision language models (LVLMs) has lagged behind their rapid development, making it challenging to fully reflect these models' knowledge capacity and reliability. In this paper, we introduce the first factuality-based visual question-answering benchmark in Chinese, named ChineseSimpleVQA, aimed at assessing the visual factuality of LVLMs across 8 major t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11718v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11718v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11718v1-abstract-full" style="display: none;"> The evaluation of factual accuracy in large vision language models (LVLMs) has lagged behind their rapid development, making it challenging to fully reflect these models' knowledge capacity and reliability. In this paper, we introduce the first factuality-based visual question-answering benchmark in Chinese, named ChineseSimpleVQA, aimed at assessing the visual factuality of LVLMs across 8 major topics and 56 subtopics. The key features of this benchmark include a focus on the Chinese language, diverse knowledge types, a multi-hop question construction, high-quality data, static consistency, and easy-to-evaluate through short answers. Moreover, we contribute a rigorous data construction pipeline and decouple the visual factuality into two parts: seeing the world (i.e., object recognition) and discovering knowledge. This decoupling allows us to analyze the capability boundaries and execution mechanisms of LVLMs. Subsequently, we evaluate 34 advanced open-source and closed-source models, revealing critical performance gaps within this field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11718v1-abstract-full').style.display = 'none'; document.getElementById('2502.11718v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">24 pages, 21 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11573">arXiv:2502.11573</a> <span> [<a href="https://arxiv.org/pdf/2502.11573">pdf</a>, <a href="https://arxiv.org/format/2502.11573">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> InfiR : Crafting Effective Small Language Models and Multimodal Small Language Models in Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+C">Congkai Xie</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+S">Shuo Cai</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenjun Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+P">Pengxiang Li</a>, <a href="/search/cs?searchtype=author&query=Sang%2C+Z">Zhijie Sang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+K">Kejing Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yiming Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhen Li</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+G">Guanghao Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zeyu Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Y">Yang Yu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuhang Liu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+S">Su Lu</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Baoyi He</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Q">Qi Zhou</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xiaotian Han</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jianbo Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shengyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+F">Fei Wu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Hongxia Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11573v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs) have made significant advancements in reasoning capabilities. However, they still face challenges such as high computational demands and privacy concerns. This paper focuses on developing efficient Small Language Models (SLMs) and Multimodal Small Language Models (MSLMs) that retain competitive reasoning abilities. We introd… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11573v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11573v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11573v1-abstract-full" style="display: none;"> Large Language Models (LLMs) and Multimodal Large Language Models (MLLMs) have made significant advancements in reasoning capabilities. However, they still face challenges such as high computational demands and privacy concerns. This paper focuses on developing efficient Small Language Models (SLMs) and Multimodal Small Language Models (MSLMs) that retain competitive reasoning abilities. We introduce a novel training pipeline that enhances reasoning capabilities and facilitates deployment on edge devices, achieving state-of-the-art performance while minimizing development costs. \InfR~ aims to advance AI systems by improving reasoning, reducing adoption barriers, and addressing privacy concerns through smaller model sizes. Resources are available at https://github. com/Reallm-Labs/InfiR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11573v1-abstract-full').style.display = 'none'; document.getElementById('2502.11573v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.11089">arXiv:2502.11089</a> <span> [<a href="https://arxiv.org/pdf/2502.11089">pdf</a>, <a href="https://arxiv.org/format/2502.11089">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jingyang Yuan</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Huazuo Gao</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+D">Damai Dai</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+J">Junyu Luo</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Liang Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhengyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Z">Zhenda Xie</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y+X">Y. X. Wei</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lean Wang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zhiping Xiao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuqing Wang</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+C">Chong Ruan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Ming Zhang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+W">Wenfeng Liang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+W">Wangding Zeng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.11089v1-abstract-short" style="display: inline;"> Long-context modeling is crucial for next-generation language models, yet the high computational cost of standard attention mechanisms poses significant computational challenges. Sparse attention offers a promising direction for improving efficiency while maintaining model capabilities. We present NSA, a Natively trainable Sparse Attention mechanism that integrates algorithmic innovations with har… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11089v1-abstract-full').style.display = 'inline'; document.getElementById('2502.11089v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.11089v1-abstract-full" style="display: none;"> Long-context modeling is crucial for next-generation language models, yet the high computational cost of standard attention mechanisms poses significant computational challenges. Sparse attention offers a promising direction for improving efficiency while maintaining model capabilities. We present NSA, a Natively trainable Sparse Attention mechanism that integrates algorithmic innovations with hardware-aligned optimizations to achieve efficient long-context modeling. NSA employs a dynamic hierarchical sparse strategy, combining coarse-grained token compression with fine-grained token selection to preserve both global context awareness and local precision. Our approach advances sparse attention design with two key innovations: (1) We achieve substantial speedups through arithmetic intensity-balanced algorithm design, with implementation optimizations for modern hardware. (2) We enable end-to-end training, reducing pretraining computation without sacrificing model performance. As shown in Figure 1, experiments show the model pretrained with NSA maintains or exceeds Full Attention models across general benchmarks, long-context tasks, and instruction-based reasoning. Meanwhile, NSA achieves substantial speedups over Full Attention on 64k-length sequences across decoding, forward propagation, and backward propagation, validating its efficiency throughout the model lifecycle. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.11089v1-abstract-full').style.display = 'none'; document.getElementById('2502.11089v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10677">arXiv:2502.10677</a> <span> [<a href="https://arxiv.org/pdf/2502.10677">pdf</a>, <a href="https://arxiv.org/format/2502.10677">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> FocalCount: Towards Class-Count Imbalance in Class-Agnostic Counting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Huilin Zhu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jingling Yuan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhengwei Yang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Y">Yu Guo</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+X">Xian Zhong</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shengfeng He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10677v1-abstract-short" style="display: inline;"> In class-agnostic object counting, the goal is to estimate the total number of object instances in an image without distinguishing between specific categories. Existing methods often predict this count without considering class-specific outputs, leading to inaccuracies when such outputs are required. These inaccuracies stem from two key challenges: 1) the prevalence of single-category images in da… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10677v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10677v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10677v1-abstract-full" style="display: none;"> In class-agnostic object counting, the goal is to estimate the total number of object instances in an image without distinguishing between specific categories. Existing methods often predict this count without considering class-specific outputs, leading to inaccuracies when such outputs are required. These inaccuracies stem from two key challenges: 1) the prevalence of single-category images in datasets, which leads models to generalize specific categories as representative of all objects, and 2) the use of mean squared error loss during training, which applies uniform penalization. This uniform penalty disregards errors in less frequent categories, particularly when these errors contribute minimally to the overall loss. To address these issues, we propose {FocalCount}, a novel approach that leverages diverse feature attributes to estimate the number of object categories in an image. This estimate serves as a weighted factor to correct class-count imbalances. Additionally, we introduce {Focal-MSE}, a new loss function that integrates binary cross-entropy to generate stronger error gradients, enhancing the model's sensitivity to errors in underrepresented categories. Our approach significantly improves the model's ability to distinguish between specific classes and general counts, demonstrating superior performance and scalability in both few-shot and zero-shot scenarios across three object counting datasets. The code will be released soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10677v1-abstract-full').style.display = 'none'; document.getElementById('2502.10677v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.10042">arXiv:2502.10042</a> <span> [<a href="https://arxiv.org/pdf/2502.10042">pdf</a>, <a href="https://arxiv.org/format/2502.10042">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Scaling Law Tradeoff Between Throughput and Sensing Distance in Large ISAC Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+M">Min Qiu</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+M">Ming-Chun Lee</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yu-Chih Huang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jinhong Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.10042v1-abstract-short" style="display: inline;"> In this paper, we investigate the fundamental tradeoff between communication and sensing performance of \emph{ad hoc} integrated sensing and communication (ISAC) wireless networks. Specifically, we consider that $n$ nodes are randomly located in an extended network with area $n$ and transmit ISAC signals. Under the pure path loss channel gain model and the condition that the transmission power sca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10042v1-abstract-full').style.display = 'inline'; document.getElementById('2502.10042v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.10042v1-abstract-full" style="display: none;"> In this paper, we investigate the fundamental tradeoff between communication and sensing performance of \emph{ad hoc} integrated sensing and communication (ISAC) wireless networks. Specifically, we consider that $n$ nodes are randomly located in an extended network with area $n$ and transmit ISAC signals. Under the pure path loss channel gain model and the condition that the transmission power scales according to the communication distance, we fully characterize the optimal scaling law tradeoff between throughput and sensing distance by proposing an achievable scheme and proving its converse. Our results can be interpreted as follows: by reducing the throughput by a factor of a function of $n$, the sensing range order improves according to the same function of $n$, raised to the power of the ratio between the path loss factors in communication and sensing. We prove that the same result also holds true for ISAC networks with random fading, despite the uncertainty on the connectivity and power level created by random fading. In addition, we show that the scaling law tradeoff cannot be improved by allowing the transmission power and communication distance to scale freely. To the best of our knowledge, this is the first work formally formulating and characterizing the communication and sensing performance scaling law tradeoff of \emph{ad hoc} ISAC networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.10042v1-abstract-full').style.display = 'none'; document.getElementById('2502.10042v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09670">arXiv:2502.09670</a> <span> [<a href="https://arxiv.org/pdf/2502.09670">pdf</a>, <a href="https://arxiv.org/format/2502.09670">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> The Science of Evaluating Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiayi Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiamu Zhang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+A">Andrew Wen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xia Hu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09670v1-abstract-short" style="display: inline;"> The emergent phenomena of large foundation models have revolutionized natural language processing. However, evaluating these models presents significant challenges due to their size, capabilities, and deployment across diverse applications. Existing literature often focuses on individual aspects, such as benchmark performance or specific tasks, but fails to provide a cohesive process that integrat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09670v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09670v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09670v1-abstract-full" style="display: none;"> The emergent phenomena of large foundation models have revolutionized natural language processing. However, evaluating these models presents significant challenges due to their size, capabilities, and deployment across diverse applications. Existing literature often focuses on individual aspects, such as benchmark performance or specific tasks, but fails to provide a cohesive process that integrates the nuances of diverse use cases with broader ethical and operational considerations. This work focuses on three key aspects: (1) Formalizing the Evaluation Process by providing a structured framework tailored to specific use-case contexts, (2) Offering Actionable Tools and Frameworks such as checklists and templates to ensure thorough, reproducible, and practical evaluations, and (3) Surveying Recent Work with a targeted review of advancements in LLM evaluation, emphasizing real-world applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09670v1-abstract-full').style.display = 'none'; document.getElementById('2502.09670v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.09027">arXiv:2502.09027</a> <span> [<a href="https://arxiv.org/pdf/2502.09027">pdf</a>, <a href="https://arxiv.org/format/2502.09027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3701716.3715206">10.1145/3701716.3715206 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Contextual-Aware Position Encoding for Sequential Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jun Yuan</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+G">Guohao Cai</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+Z">Zhenhua Dong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.09027v1-abstract-short" style="display: inline;"> Sequential recommendation (SR), which encodes user activity to predict the next action, has emerged as a widely adopted strategy in developing commercial personalized recommendation systems. A critical component of modern SR models is the attention mechanism, which synthesizes users' historical activities. This mechanism is typically order-invariant and generally relies on position encoding (PE).… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09027v1-abstract-full').style.display = 'inline'; document.getElementById('2502.09027v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.09027v1-abstract-full" style="display: none;"> Sequential recommendation (SR), which encodes user activity to predict the next action, has emerged as a widely adopted strategy in developing commercial personalized recommendation systems. A critical component of modern SR models is the attention mechanism, which synthesizes users' historical activities. This mechanism is typically order-invariant and generally relies on position encoding (PE). Conventional SR models simply assign a learnable vector to each position, resulting in only modest gains compared to traditional recommendation models. Moreover, limited research has been conducted on position encoding tailored for sequential recommendation, leaving a significant gap in addressing its unique requirements. To bridge this gap, we propose a novel Contextual-Aware Position Encoding method for sequential recommendation, abbreviated as CAPE. To the best of our knowledge, CAPE is the first PE method specifically designed for sequential recommendation. Comprehensive experiments conducted on benchmark SR datasets demonstrate that CAPE consistently enhances multiple mainstream backbone models and achieves state-of-the-art performance, across small and large scale model size. Furthermore, we deployed CAPE in an industrial setting on a real-world commercial platform, clearly showcasing the effectiveness of our approach. Our source code is available at https://github.com/yjdy/CAPE. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.09027v1-abstract-full').style.display = 'none'; document.getElementById('2502.09027v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by WWW'25 Industry Track</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07027">arXiv:2502.07027</a> <span> [<a href="https://arxiv.org/pdf/2502.07027">pdf</a>, <a href="https://arxiv.org/format/2502.07027">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Representational Alignment with Chemical Induced Fit for Molecular Relational Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Peiliang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jingling Yuan</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Q">Qing Xie</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yongjun Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Lin Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07027v1-abstract-short" style="display: inline;"> Molecular Relational Learning (MRL) is widely applied in natural sciences to predict relationships between molecular pairs by extracting structural features. The representational similarity between substructure pairs determines the functional compatibility of molecular binding sites. Nevertheless, aligning substructure representations by attention mechanisms lacks guidance from chemical knowledge,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07027v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07027v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07027v1-abstract-full" style="display: none;"> Molecular Relational Learning (MRL) is widely applied in natural sciences to predict relationships between molecular pairs by extracting structural features. The representational similarity between substructure pairs determines the functional compatibility of molecular binding sites. Nevertheless, aligning substructure representations by attention mechanisms lacks guidance from chemical knowledge, resulting in unstable model performance in chemical space (\textit{e.g.}, functional group, scaffold) shifted data. With theoretical justification, we propose the \textbf{Re}presentational \textbf{Align}ment with Chemical Induced \textbf{Fit} (ReAlignFit) to enhance the stability of MRL. ReAlignFit dynamically aligns substructure representation in MRL by introducing chemical Induced Fit-based inductive bias. In the induction process, we design the Bias Correction Function based on substructure edge reconstruction to align representations between substructure pairs by simulating chemical conformational changes (dynamic combination of substructures). ReAlignFit further integrates the Subgraph Information Bottleneck during fit process to refine and optimize substructure pairs exhibiting high chemical functional compatibility, leveraging them to generate molecular embeddings. Experimental results on nine datasets demonstrate that ReAlignFit outperforms state-of-the-art models in two tasks and significantly enhances model's stability in both rule-shifted and scaffold-shifted data distributions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07027v1-abstract-full').style.display = 'none'; document.getElementById('2502.07027v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05494">arXiv:2502.05494</a> <span> [<a href="https://arxiv.org/pdf/2502.05494">pdf</a>, <a href="https://arxiv.org/format/2502.05494">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> </div> </div> <p class="title is-5 mathjax"> Multi-scale Masked Autoencoder for Electrocardiogram Anomaly Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Ya Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yujie Yang</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+J">Jianhuang Gan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiangjie Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jing Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wei Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05494v1-abstract-short" style="display: inline;"> Electrocardiogram (ECG) analysis is a fundamental tool for diagnosing cardiovascular conditions, yet anomaly detection in ECG signals remains challenging due to their inherent complexity and variability. We propose Multi-scale Masked Autoencoder for ECG anomaly detection (MMAE-ECG), a novel end-to-end framework that effectively captures both global and local dependencies in ECG data. Unlike state-… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05494v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05494v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05494v1-abstract-full" style="display: none;"> Electrocardiogram (ECG) analysis is a fundamental tool for diagnosing cardiovascular conditions, yet anomaly detection in ECG signals remains challenging due to their inherent complexity and variability. We propose Multi-scale Masked Autoencoder for ECG anomaly detection (MMAE-ECG), a novel end-to-end framework that effectively captures both global and local dependencies in ECG data. Unlike state-of-the-art methods that rely on heartbeat segmentation or R-peak detection, MMAE-ECG eliminates the need for such pre-processing steps, enhancing its suitability for clinical deployment. MMAE-ECG partitions ECG signals into non-overlapping segments, with each segment assigned learnable positional embeddings. A novel multi-scale masking strategy and multi-scale attention mechanism, along with distinct positional embeddings, enable a lightweight Transformer encoder to effectively capture both local and global dependencies. The masked segments are then reconstructed using a single-layer Transformer block, with an aggregation strategy employed during inference to refine the outputs. Experimental results demonstrate that our method achieves performance comparable to state-of-the-art approaches while significantly reducing computational complexity-approximately 1/78 of the floating-point operations (FLOPs) required for inference. Ablation studies further validate the effectiveness of each component, highlighting the potential of multi-scale masked autoencoders for anomaly detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05494v1-abstract-full').style.display = 'none'; document.getElementById('2502.05494v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Under review in a journal</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04364">arXiv:2502.04364</a> <span> [<a href="https://arxiv.org/pdf/2502.04364">pdf</a>, <a href="https://arxiv.org/format/2502.04364">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Lost in Edits? A $位$-Compass for AIGC Provenance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=You%2C+W">Wenhao You</a>, <a href="/search/cs?searchtype=author&query=Hooi%2C+B">Bryan Hooi</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yiwei Wang</a>, <a href="/search/cs?searchtype=author&query=Choo%2C+E">Euijin Choo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Ming-Hsuan Yang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Junsong Yuan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zi Huang</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+Y">Yujun Cai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04364v1-abstract-short" style="display: inline;"> Recent advancements in diffusion models have driven the growth of text-guided image editing tools, enabling precise and iterative modifications of synthesized content. However, as these tools become increasingly accessible, they also introduce significant risks of misuse, emphasizing the critical need for robust attribution methods to ensure content authenticity and traceability. Despite the creat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04364v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04364v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04364v1-abstract-full" style="display: none;"> Recent advancements in diffusion models have driven the growth of text-guided image editing tools, enabling precise and iterative modifications of synthesized content. However, as these tools become increasingly accessible, they also introduce significant risks of misuse, emphasizing the critical need for robust attribution methods to ensure content authenticity and traceability. Despite the creative potential of such tools, they pose significant challenges for attribution, particularly in adversarial settings where edits can be layered to obscure an image's origins. We propose LambdaTracer, a novel latent-space attribution method that robustly identifies and differentiates authentic outputs from manipulated ones without requiring any modifications to generative or editing pipelines. By adaptively calibrating reconstruction losses, LambdaTracer remains effective across diverse iterative editing processes, whether automated through text-guided editing tools such as InstructPix2Pix and ControlNet or performed manually with editing software such as Adobe Photoshop. Extensive experiments reveal that our method consistently outperforms baseline approaches in distinguishing maliciously edited images, providing a practical solution to safeguard ownership, creativity, and credibility in the open, fast-evolving AI ecosystems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04364v1-abstract-full').style.display = 'none'; document.getElementById('2502.04364v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01456">arXiv:2502.01456</a> <span> [<a href="https://arxiv.org/pdf/2502.01456">pdf</a>, <a href="https://arxiv.org/format/2502.01456">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Process Reinforcement through Implicit Rewards </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cui%2C+G">Ganqu Cui</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+L">Lifan Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zefan Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hanbin Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wendi Li</a>, <a href="/search/cs?searchtype=author&query=He%2C+B">Bingxiang He</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+Y">Yuchen Fan</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tianyu Yu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Q">Qixin Xu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weize Chen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiarui Yuan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Huayu Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kaiyan Zhang</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+X">Xingtai Lv</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+Y">Yuan Yao</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+H">Hao Peng</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yu Cheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhiyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bowen Zhou</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+N">Ning Ding</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01456v1-abstract-short" style="display: inline;"> Dense process rewards have proven a more effective alternative to the sparse outcome-level rewards in the inference-time scaling of large language models (LLMs), particularly in tasks requiring complex multi-step reasoning. While dense rewards also offer an appealing choice for the reinforcement learning (RL) of LLMs since their fine-grained rewards have the potential to address some inherent issu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01456v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01456v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01456v1-abstract-full" style="display: none;"> Dense process rewards have proven a more effective alternative to the sparse outcome-level rewards in the inference-time scaling of large language models (LLMs), particularly in tasks requiring complex multi-step reasoning. While dense rewards also offer an appealing choice for the reinforcement learning (RL) of LLMs since their fine-grained rewards have the potential to address some inherent issues of outcome rewards, such as training efficiency and credit assignment, this potential remains largely unrealized. This can be primarily attributed to the challenges of training process reward models (PRMs) online, where collecting high-quality process labels is prohibitively expensive, making them particularly vulnerable to reward hacking. To address these challenges, we propose PRIME (Process Reinforcement through IMplicit rEwards), which enables online PRM updates using only policy rollouts and outcome labels through implict process rewards. PRIME combines well with various advantage functions and forgoes the dedicated reward model training phrase that existing approaches require, substantially reducing the development overhead. We demonstrate PRIME's effectiveness on competitional math and coding. Starting from Qwen2.5-Math-7B-Base, PRIME achieves a 15.1% average improvement across several key reasoning benchmarks over the SFT model. Notably, our resulting model, Eurus-2-7B-PRIME, surpasses Qwen2.5-Math-7B-Instruct on seven reasoning benchmarks with 10% of its training data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01456v1-abstract-full').style.display = 'none'; document.getElementById('2502.01456v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages. Model&Code&Data available at https://github.com/PRIME-RL/PRIME</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01078">arXiv:2502.01078</a> <span> [<a href="https://arxiv.org/pdf/2502.01078">pdf</a>, <a href="https://arxiv.org/ps/2502.01078">ps</a>, <a href="https://arxiv.org/format/2502.01078">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Parallel Coding for Orthogonal Delay-Doppler Division Multiplexing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Q">Qi Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jinhong Yuan</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+M">Min Qiu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01078v1-abstract-short" style="display: inline;"> This paper proposes a novel parallel coding transmission strategy and an iterative detection and decoding receiver signal processing technique for orthogonal delay-Doppler division multiplexing (ODDM) modulation. Specifically, the proposed approach employs a parallel channel encoding (PCE) scheme that consists of multiple short-length codewords for each delay-Doppler multicarrier (DDMC) symbol. Bu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01078v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01078v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01078v1-abstract-full" style="display: none;"> This paper proposes a novel parallel coding transmission strategy and an iterative detection and decoding receiver signal processing technique for orthogonal delay-Doppler division multiplexing (ODDM) modulation. Specifically, the proposed approach employs a parallel channel encoding (PCE) scheme that consists of multiple short-length codewords for each delay-Doppler multicarrier (DDMC) symbol. Building upon such a PCE transmission framework, we then introduce an iterative detection and decoding algorithm incorporating a successive decoding feedback (SDF) technique, which enables instant information exchange between the detector and decoder for each DDMC symbol. To characterize the error performance of the proposed scheme, we perform density evolution analysis considering the finite blocklength effects. Our analysis results, coupled with extensive simulations, demonstrate that the proposed PCE scheme with the SDF algorithm not only showcases a better overall performance but also requires much less decoding complexity to implement, compared to the conventional benchmark scheme that relies on a single long channel code for coding the entire ODDM frame. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01078v1-abstract-full').style.display = 'none'; document.getElementById('2502.01078v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages, 12 figures, accepted by IEEE Transactions on Communications</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18143">arXiv:2501.18143</a> <span> [<a href="https://arxiv.org/pdf/2501.18143">pdf</a>, <a href="https://arxiv.org/format/2501.18143">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Dual-Bounded Nonlinear Optimal Transport for Size Constrained Min Cut Clustering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+F">Fangyuan Xie</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jinghui Yuan</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+F">Feiping Nie</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xuelong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18143v1-abstract-short" style="display: inline;"> Min cut is an important graph partitioning method. However, current solutions to the min cut problem suffer from slow speeds, difficulty in solving, and often converge to simple solutions. To address these issues, we relax the min cut problem into a dual-bounded constraint and, for the first time, treat the min cut problem as a dual-bounded nonlinear optimal transport problem. Additionally, we dev… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18143v1-abstract-full').style.display = 'inline'; document.getElementById('2501.18143v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18143v1-abstract-full" style="display: none;"> Min cut is an important graph partitioning method. However, current solutions to the min cut problem suffer from slow speeds, difficulty in solving, and often converge to simple solutions. To address these issues, we relax the min cut problem into a dual-bounded constraint and, for the first time, treat the min cut problem as a dual-bounded nonlinear optimal transport problem. Additionally, we develop a method for solving dual-bounded nonlinear optimal transport based on the Frank-Wolfe method (abbreviated as DNF). Notably, DNF not only solves the size constrained min cut problem but is also applicable to all dual-bounded nonlinear optimal transport problems. We prove that for convex problems satisfying Lipschitz smoothness, the DNF method can achieve a convergence rate of \(\mathcal{O}(\frac{1}{t})\). We apply the DNF method to the min cut problem and find that it achieves state-of-the-art performance in terms of both the loss function and clustering accuracy at the fastest speed, with a convergence rate of \(\mathcal{O}(\frac{1}{\sqrt{t}})\). Moreover, the DNF method for the size constrained min cut problem requires no parameters and exhibits better stability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18143v1-abstract-full').style.display = 'none'; document.getElementById('2501.18143v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18080">arXiv:2501.18080</a> <span> [<a href="https://arxiv.org/pdf/2501.18080">pdf</a>, <a href="https://arxiv.org/ps/2501.18080">ps</a>, <a href="https://arxiv.org/format/2501.18080">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> PAC Codes Meet CRC-Polar Codes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+X">Xinyi Gu</a>, <a href="/search/cs?searchtype=author&query=Rowshan%2C+M">Mohammad Rowshan</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jinhong Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18080v2-abstract-short" style="display: inline;"> CRC-Polar codes under SC list decoding are well-regarded for their competitive error performance. This paper examines these codes by focusing on minimum weight codewords, breaking them down into the rows of the polar transform. Inspired by the significant impact of parity check bits and their positions, we apply a shifted rate-profile for polarization-adjusted convolutional (PS-PAC) codes, thereby… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18080v2-abstract-full').style.display = 'inline'; document.getElementById('2501.18080v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18080v2-abstract-full" style="display: none;"> CRC-Polar codes under SC list decoding are well-regarded for their competitive error performance. This paper examines these codes by focusing on minimum weight codewords, breaking them down into the rows of the polar transform. Inspired by the significant impact of parity check bits and their positions, we apply a shifted rate-profile for polarization-adjusted convolutional (PS-PAC) codes, thereby achieving similar improvements in the weight distribution of polar codes through precoding. The results demonstrate a significant improvement in error performance, achieving up to a 0.5 dB power gain with short PS-PAC codes. Additionally, leveraging convolutional precoding in PAC codes, we adopt a continuous deployment (masking) of parity check bits derived from the remainder of continuous division of the partial message polynomial and the CRC polynomial over frozen positions in the rate-profile. This approach enhances performance for medium-length codes, with an overall improvement of 0.12 dB. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18080v2-abstract-full').style.display = 'none'; document.getElementById('2501.18080v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">arXiv admin note: text overlap with arXiv:2406.01903</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17088">arXiv:2501.17088</a> <span> [<a href="https://arxiv.org/pdf/2501.17088">pdf</a>, <a href="https://arxiv.org/format/2501.17088">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Mamba-Shedder: Post-Transformer Compression for Efficient Selective Structured State Space Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mu%C3%B1oz%2C+J+P">J. Pablo Mu帽oz</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jinjie Yuan</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+N">Nilesh Jain</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17088v1-abstract-short" style="display: inline;"> Large pre-trained models have achieved outstanding results in sequence modeling. The Transformer block and its attention mechanism have been the main drivers of the success of these models. Recently, alternative architectures, such as Selective Structured State Space Models (SSMs), have been proposed to address the inefficiencies of Transformers. This paper explores the compression of SSM-based mo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17088v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17088v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17088v1-abstract-full" style="display: none;"> Large pre-trained models have achieved outstanding results in sequence modeling. The Transformer block and its attention mechanism have been the main drivers of the success of these models. Recently, alternative architectures, such as Selective Structured State Space Models (SSMs), have been proposed to address the inefficiencies of Transformers. This paper explores the compression of SSM-based models, particularly Mamba and its hybrids. We study the sensitivity of these models to the removal of selected components at different granularities to reduce the model size and computational overhead, thus improving their efficiency while maintaining accuracy. The proposed solutions, collectively referred to as Mamba-Shedder, achieve a speedup of up to 1.4x during inference, demonstrating that model efficiency can be improved by eliminating several redundancies with minimal impact on the overall model performance. The code is available at https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17088v1-abstract-full').style.display = 'none'; document.getElementById('2501.17088v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NAACL-25 - Main track</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.0 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16372">arXiv:2501.16372</a> <span> [<a href="https://arxiv.org/pdf/2501.16372">pdf</a>, <a href="https://arxiv.org/format/2501.16372">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Low-Rank Adapters Meet Neural Architecture Search for LLM Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mu%C3%B1oz%2C+J+P">J. Pablo Mu帽oz</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jinjie Yuan</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+N">Nilesh Jain</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16372v1-abstract-short" style="display: inline;"> The rapid expansion of Large Language Models (LLMs) has posed significant challenges regarding the computational resources required for fine-tuning and deployment. Recent advancements in low-rank adapters have demonstrated their efficacy in parameter-efficient fine-tuning (PEFT) of these models. This retrospective paper comprehensively discusses innovative approaches that synergize low-rank repres… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16372v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16372v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16372v1-abstract-full" style="display: none;"> The rapid expansion of Large Language Models (LLMs) has posed significant challenges regarding the computational resources required for fine-tuning and deployment. Recent advancements in low-rank adapters have demonstrated their efficacy in parameter-efficient fine-tuning (PEFT) of these models. This retrospective paper comprehensively discusses innovative approaches that synergize low-rank representations with Neural Architecture Search (NAS) techniques, particularly weight-sharing super-networks. Robust solutions for compressing and fine-tuning large pre-trained models are developed by integrating these methodologies. Our analysis highlights the potential of these combined strategies to democratize the use of LLMs, making them more accessible for deployment in resource-constrained environments. The resulting models exhibit reduced memory footprints and faster inference times, paving the way for more practical and scalable applications of LLMs. Models and code are available at https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16372v1-abstract-full').style.display = 'none'; document.getElementById('2501.16372v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">AAAI-25 Workshop on Connecting Low-rank Representations in AI</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.15749">arXiv:2501.15749</a> <span> [<a href="https://arxiv.org/pdf/2501.15749">pdf</a>, <a href="https://arxiv.org/format/2501.15749">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> LLM-powered Multi-agent Framework for Goal-oriented Learning in Intelligent Tutoring System </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tianfu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+Y">Yi Zhan</a>, <a href="/search/cs?searchtype=author&query=Lian%2C+J">Jianxun Lian</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Z">Zhengyu Hu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+N+J">Nicholas Jing Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qi Zhang</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+X">Xing Xie</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Hui Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.15749v1-abstract-short" style="display: inline;"> Intelligent Tutoring Systems (ITSs) have revolutionized education by offering personalized learning experiences. However, as goal-oriented learning, which emphasizes efficiently achieving specific objectives, becomes increasingly important in professional contexts, existing ITSs often struggle to deliver this type of targeted learning experience. In this paper, we propose GenMentor, an LLM-powered… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15749v1-abstract-full').style.display = 'inline'; document.getElementById('2501.15749v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.15749v1-abstract-full" style="display: none;"> Intelligent Tutoring Systems (ITSs) have revolutionized education by offering personalized learning experiences. However, as goal-oriented learning, which emphasizes efficiently achieving specific objectives, becomes increasingly important in professional contexts, existing ITSs often struggle to deliver this type of targeted learning experience. In this paper, we propose GenMentor, an LLM-powered multi-agent framework designed to deliver goal-oriented, personalized learning within ITS. GenMentor begins by accurately mapping learners' goals to required skills using a fine-tuned LLM trained on a custom goal-to-skill dataset. After identifying the skill gap, it schedules an efficient learning path using an evolving optimization approach, driven by a comprehensive and dynamic profile of learners' multifaceted status. Additionally, GenMentor tailors learning content with an exploration-drafting-integration mechanism to align with individual learner needs. Extensive automated and human evaluations demonstrate GenMentor's effectiveness in learning guidance and content quality. Furthermore, we have deployed it in practice and also implemented it as an application. Practical human study with professional learners further highlights its effectiveness in goal alignment and resource targeting, leading to enhanced personalization. Supplementary resources are available at https://github.com/GeminiLight/gen-mentor. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.15749v1-abstract-full').style.display = 'none'; document.getElementById('2501.15749v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by WWW 2025 (Industry Track)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.12948">arXiv:2501.12948</a> <span> [<a href="https://arxiv.org/pdf/2501.12948">pdf</a>, <a href="https://arxiv.org/format/2501.12948">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=DeepSeek-AI"> DeepSeek-AI</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daya Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dejian Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haowei Zhang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Junxiao Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruoyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+R">Runxin Xu</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Q">Qihao Zhu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+S">Shirong Ma</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peiyi Wang</a>, <a href="/search/cs?searchtype=author&query=Bi%2C+X">Xiao Bi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaokang Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xingkai Yu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yu Wu</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z+F">Z. F. Wu</a>, <a href="/search/cs?searchtype=author&query=Gou%2C+Z">Zhibin Gou</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Z">Zhihong Shao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhuoshu Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Ziyi Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Aixin Liu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+B">Bing Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bochao Wu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bei Feng</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chengda Lu</a> , et al. (175 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.12948v1-abstract-short" style="display: inline;"> We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12948v1-abstract-full').style.display = 'inline'; document.getElementById('2501.12948v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.12948v1-abstract-full" style="display: none;"> We introduce our first-generation reasoning models, DeepSeek-R1-Zero and DeepSeek-R1. DeepSeek-R1-Zero, a model trained via large-scale reinforcement learning (RL) without supervised fine-tuning (SFT) as a preliminary step, demonstrates remarkable reasoning capabilities. Through RL, DeepSeek-R1-Zero naturally emerges with numerous powerful and intriguing reasoning behaviors. However, it encounters challenges such as poor readability, and language mixing. To address these issues and further enhance reasoning performance, we introduce DeepSeek-R1, which incorporates multi-stage training and cold-start data before RL. DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on reasoning tasks. To support the research community, we open-source DeepSeek-R1-Zero, DeepSeek-R1, and six dense models (1.5B, 7B, 8B, 14B, 32B, 70B) distilled from DeepSeek-R1 based on Qwen and Llama. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.12948v1-abstract-full').style.display = 'none'; document.getElementById('2501.12948v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.09949">arXiv:2501.09949</a> <span> [<a href="https://arxiv.org/pdf/2501.09949">pdf</a>, <a href="https://arxiv.org/format/2501.09949">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MultiPruner: Balanced Structure Removal in Foundation Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mu%C3%B1oz%2C+J+P">J. Pablo Mu帽oz</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jinjie Yuan</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+N">Nilesh Jain</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.09949v1-abstract-short" style="display: inline;"> Recently, state-of-the-art approaches for pruning large pre-trained models (LPMs) have demonstrated that the training-free removal of non-critical residual blocks in Transformers is viable for reducing model size, achieving results that outperform previous training-free pruning approaches. Motivated by these findings, we extend BlockPruner (Zhong et al., 2024) and propose MultiPruner, a pruning ap… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09949v1-abstract-full').style.display = 'inline'; document.getElementById('2501.09949v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.09949v1-abstract-full" style="display: none;"> Recently, state-of-the-art approaches for pruning large pre-trained models (LPMs) have demonstrated that the training-free removal of non-critical residual blocks in Transformers is viable for reducing model size, achieving results that outperform previous training-free pruning approaches. Motivated by these findings, we extend BlockPruner (Zhong et al., 2024) and propose MultiPruner, a pruning approach that surpasses recent training-free pruning methods by adopting a multidimensional, iterative, fine-grained pruning strategy. In MultiPruner, multidimensional pruning reinstates the structural balance in block-pruned models by sequentially compressing along three dimensions: i) residual blocks, ii) channels of multilayer perceptrons (MLP), and iii) attention heads. This solution enhances zero-shot accuracy on downstream tasks compared to other techniques while improving model compression ratios, producing compressed models with fewer computing and memory requirements. Extensive experiments demonstrate the advantages of the proposed method across various large pre-trained models. The code and pruning configurations are available at https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.09949v1-abstract-full').style.display = 'none'; document.getElementById('2501.09949v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.0 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08026">arXiv:2501.08026</a> <span> [<a href="https://arxiv.org/pdf/2501.08026">pdf</a>, <a href="https://arxiv.org/format/2501.08026">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Orthogonal Delay-Doppler Division Multiplexing Modulation with Hierarchical Mode-Based Index Modulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kehan Huang</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+M">Min Qiu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jinhong Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08026v1-abstract-short" style="display: inline;"> The orthogonal time frequency space with index modulation (OTFS-IM) offers flexible tradeoffs between spectral efficiency (SE) and bit error rate (BER) in doubly selective fading channels. While OTFS-IM schemes demonstrated such potential, a persistent challenge lies in the detection complexity. To address this problem, we propose the hierarchical mode-based index modulation (HMIM). HMIM introduce… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08026v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08026v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08026v1-abstract-full" style="display: none;"> The orthogonal time frequency space with index modulation (OTFS-IM) offers flexible tradeoffs between spectral efficiency (SE) and bit error rate (BER) in doubly selective fading channels. While OTFS-IM schemes demonstrated such potential, a persistent challenge lies in the detection complexity. To address this problem, we propose the hierarchical mode-based index modulation (HMIM). HMIM introduces a novel approach to modulate information bits by IM patterns, significantly simplifying the complexity of maximum a posteriori (MAP) estimation with Gaussian noise. Further, we incorporate HMIM with the recently proposed orthogonal delay-Doppler division multiplexing (ODDM) modulation, namely ODDM-HMIM, to exploit the full diversity of the delay-Doppler (DD) channel. The BER performance of ODDM-HMIM is analyzed considering a maximum likelihood (ML) detector. Our numerical results reveal that, with the same SE, HMIM can outperform conventional IM in terms of both BER and computational complexity. In addition, we propose a successive interference cancellation-based minimum mean square error (SIC-MMSE) detector for ODDM-HMIM, which enables low-complexity detection with large frame sizes. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08026v1-abstract-full').style.display = 'none'; document.getElementById('2501.08026v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05179">arXiv:2501.05179</a> <span> [<a href="https://arxiv.org/pdf/2501.05179">pdf</a>, <a href="https://arxiv.org/format/2501.05179">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Compression with Global Guidance: Towards Training-free High-Resolution MLLMs Acceleration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuyang Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziming Wang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Y">Yuhang Han</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yingyao Wang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiale Yuan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jun Song</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Bo Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Linfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+S">Siteng Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+H">Honggang Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05179v3-abstract-short" style="display: inline;"> Multimodal large language models (MLLMs) have attracted considerable attention due to their exceptional performance in visual content understanding and reasoning. However, their inference efficiency has been a notable concern, as the increasing length of multimodal contexts leads to quadratic complexity. Token compression techniques, which reduce the number of visual tokens, have demonstrated thei… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05179v3-abstract-full').style.display = 'inline'; document.getElementById('2501.05179v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05179v3-abstract-full" style="display: none;"> Multimodal large language models (MLLMs) have attracted considerable attention due to their exceptional performance in visual content understanding and reasoning. However, their inference efficiency has been a notable concern, as the increasing length of multimodal contexts leads to quadratic complexity. Token compression techniques, which reduce the number of visual tokens, have demonstrated their effectiveness in reducing computational costs. Yet, these approaches have struggled to keep pace with the rapid advancements in MLLMs, especially the AnyRes strategy in the context of high-resolution image understanding. In this paper, we propose a novel token compression method, GlobalCom$^2$, tailored for high-resolution MLLMs that receive both the thumbnail and multiple crops. GlobalCom$^2$ treats the tokens derived from the thumbnail as the "commander" of the entire token compression process, directing the allocation of retention ratios and the specific compression for each crop. In this way, redundant tokens are eliminated while important local details are adaptively preserved to the highest extent feasible. Empirical results across 10 benchmarks reveal that GlobalCom$^2$ achieves an optimal balance between performance and efficiency, and consistently outperforms state-of-the-art token compression methods with LLaVA-NeXT-7B/13B models. Our code is released at https://github.com/xuyang-liu16/GlobalCom2. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05179v3-abstract-full').style.display = 'none'; document.getElementById('2501.05179v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04987">arXiv:2501.04987</a> <span> [<a href="https://arxiv.org/pdf/2501.04987">pdf</a>, <a href="https://arxiv.org/format/2501.04987">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> TreeKV: Smooth Key-Value Cache Compression with Tree Structures </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=He%2C+Z">Ziwei He</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jian Yuan</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+H">Haoli Bai</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+J">Jingwen Leng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+B">Bo Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04987v2-abstract-short" style="display: inline;"> Efficient key-value (KV) cache compression is critical for scaling transformer-based Large Language Models (LLMs) in long sequences and resource-limited settings. Existing methods evict tokens based on their positions or importance scores, but position-based strategies can miss crucial information outside predefined regions, while those relying on global importance scores resulting in strong regio… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04987v2-abstract-full').style.display = 'inline'; document.getElementById('2501.04987v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04987v2-abstract-full" style="display: none;"> Efficient key-value (KV) cache compression is critical for scaling transformer-based Large Language Models (LLMs) in long sequences and resource-limited settings. Existing methods evict tokens based on their positions or importance scores, but position-based strategies can miss crucial information outside predefined regions, while those relying on global importance scores resulting in strong regional biases, limiting the KV cache's overall context retention and potentially impairing the performance of LLMs on complex tasks. Our wavelet analysis reveals that as tokens approach the end of sequence, their contributions to generation gradually increase and tends to diverge more from neighboring tokens, indicating a smooth transition with increasing complexity and variability from distant to nearby context. Motivated by this observation, we propose TreeKV, an intuitive, training-free method that employs a tree structure for smooth cache compression. TreeKV maintains a fixed cache size, allowing LLMs to deliver high-quality output even in long text scenarios. Unlike most compression methods, TreeKV is applicable to both the generation and prefilling stages. TreeKV consistently surpasses all baseline models in language modeling tasks on PG19 and OpenWebText2, allowing LLMs trained with short context window to generalize to longer window with a 16x cache reduction. On the Longbench benchmark, TreeKV achieves the best performance with only 6\% of the budget at optimal efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04987v2-abstract-full').style.display = 'none'; document.getElementById('2501.04987v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 9 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04150">arXiv:2501.04150</a> <span> [<a href="https://arxiv.org/pdf/2501.04150">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Large and Small MLLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Feng%2C+X">Xuelu Feng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunsheng Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Dongdong Chen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+M">Mei Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Mengchen Liu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Junsong Yuan</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+C">Chunming Qiao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04150v1-abstract-short" style="display: inline;"> Large multimodal language models (MLLMs) such as GPT-4V and GPT-4o have achieved remarkable advancements in understanding and generating multimodal content, showcasing superior quality and capabilities across diverse tasks. However, their deployment faces significant challenges, including slow inference, high computational cost, and impracticality for on-device applications. In contrast, the emerg… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04150v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04150v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04150v1-abstract-full" style="display: none;"> Large multimodal language models (MLLMs) such as GPT-4V and GPT-4o have achieved remarkable advancements in understanding and generating multimodal content, showcasing superior quality and capabilities across diverse tasks. However, their deployment faces significant challenges, including slow inference, high computational cost, and impracticality for on-device applications. In contrast, the emergence of small MLLMs, exemplified by the LLava-series models and Phi-3-Vision, offers promising alternatives with faster inference, reduced deployment costs, and the ability to handle domain-specific scenarios. Despite their growing presence, the capability boundaries between large and small MLLMs remain underexplored. In this work, we conduct a systematic and comprehensive evaluation to benchmark both small and large MLLMs, spanning general capabilities such as object recognition, temporal reasoning, and multimodal comprehension, as well as real-world applications in domains like industry and automotive. Our evaluation reveals that small MLLMs can achieve comparable performance to large models in specific scenarios but lag significantly in complex tasks requiring deeper reasoning or nuanced understanding. Furthermore, we identify common failure cases in both small and large MLLMs, highlighting domains where even state-of-the-art models struggle. We hope our findings will guide the research community in pushing the quality boundaries of MLLMs, advancing their usability and effectiveness across diverse applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04150v1-abstract-full').style.display = 'none'; document.getElementById('2501.04150v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03916">arXiv:2501.03916</a> <span> [<a href="https://arxiv.org/pdf/2501.03916">pdf</a>, <a href="https://arxiv.org/format/2501.03916">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Dolphin: Closed-loop Open-ended Auto-research through Thinking, Practice, and Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiakang Yuan</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xiangchao Yan</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+B">Botian Shi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tao Chen</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+L">Lei Bai</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+B">Bowen Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03916v2-abstract-short" style="display: inline;"> The scientific research paradigm is undergoing a profound transformation owing to the development of Artificial Intelligence (AI). Recent works demonstrate that various AI-assisted research methods can largely improve research efficiency by improving data analysis, accelerating computation, and fostering novel idea generation. To further move towards the ultimate goal (i.e., automatic scientific r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03916v2-abstract-full').style.display = 'inline'; document.getElementById('2501.03916v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03916v2-abstract-full" style="display: none;"> The scientific research paradigm is undergoing a profound transformation owing to the development of Artificial Intelligence (AI). Recent works demonstrate that various AI-assisted research methods can largely improve research efficiency by improving data analysis, accelerating computation, and fostering novel idea generation. To further move towards the ultimate goal (i.e., automatic scientific research), in this paper, we propose Dolphin, the first closed-loop open-ended auto-research framework to further build the entire process of human scientific research. Dolphin can generate research ideas, perform experiments, and get feedback from experimental results to generate higher-quality ideas. More specifically, Dolphin first generates novel ideas based on relevant papers which are ranked by the topic and task attributes. Then, the codes are automatically generated and debugged with the exception-traceback-guided local code structure. Finally, Dolphin automatically analyzes the results of each idea and feeds the results back to the next round of idea generation. Experiments are conducted on the benchmark datasets of different topics and results show that Dolphin can generate novel ideas continuously and complete the experiment in a loop. We highlight that Dolphin can automatically propose methods that are comparable to the state-of-the-art in some tasks such as 2D image classification and 3D point classification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03916v2-abstract-full').style.display = 'none'; document.getElementById('2501.03916v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 11 figures, and our homepage: https://alpha-innovator.github.io/Dolphin-project-page</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03689">arXiv:2501.03689</a> <span> [<a href="https://arxiv.org/pdf/2501.03689">pdf</a>, <a href="https://arxiv.org/format/2501.03689">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> MAJL: A Model-Agnostic Joint Learning Framework for Music Source Separation and Pitch Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wei%2C+H">Haojie Wei</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jun Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Q">Quanyu Dai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yueguo Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03689v1-abstract-short" style="display: inline;"> Music source separation and pitch estimation are two vital tasks in music information retrieval. Typically, the input of pitch estimation is obtained from the output of music source separation. Therefore, existing methods have tried to perform these two tasks simultaneously, so as to leverage the mutually beneficial relationship between both tasks. However, these methods still face two critical ch… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03689v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03689v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03689v1-abstract-full" style="display: none;"> Music source separation and pitch estimation are two vital tasks in music information retrieval. Typically, the input of pitch estimation is obtained from the output of music source separation. Therefore, existing methods have tried to perform these two tasks simultaneously, so as to leverage the mutually beneficial relationship between both tasks. However, these methods still face two critical challenges that limit the improvement of both tasks: the lack of labeled data and joint learning optimization. To address these challenges, we propose a Model-Agnostic Joint Learning (MAJL) framework for both tasks. MAJL is a generic framework and can use variant models for each task. It includes a two-stage training method and a dynamic weighting method named Dynamic Weights on Hard Samples (DWHS), which addresses the lack of labeled data and joint learning optimization, respectively. Experimental results on public music datasets show that MAJL outperforms state-of-the-art methods on both tasks, with significant improvements of 0.92 in Signal-to-Distortion Ratio (SDR) for music source separation and 2.71% in Raw Pitch Accuracy (RPA) for pitch estimation. Furthermore, comprehensive studies not only validate the effectiveness of each component of MAJL, but also indicate the great generality of MAJL in adapting to different model architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03689v1-abstract-full').style.display = 'none'; document.getElementById('2501.03689v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19926">arXiv:2412.19926</a> <span> [<a href="https://arxiv.org/pdf/2412.19926">pdf</a>, <a href="https://arxiv.org/ps/2412.19926">ps</a>, <a href="https://arxiv.org/format/2412.19926">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Right vs. Right: Can LLMs Make Tough Choices? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiaqing Yuan</a>, <a href="/search/cs?searchtype=author&query=Murukannaiah%2C+P+K">Pradeep K. Murukannaiah</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+M+P">Munindar P. Singh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19926v1-abstract-short" style="display: inline;"> An ethical dilemma describes a choice between two "right" options involving conflicting moral values. We present a comprehensive evaluation of how LLMs navigate ethical dilemmas. Specifically, we investigate LLMs on their (1) sensitivity in comprehending ethical dilemmas, (2) consistency in moral value choice, (3) consideration of consequences, and (4) ability to align their responses to a moral v… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19926v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19926v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19926v1-abstract-full" style="display: none;"> An ethical dilemma describes a choice between two "right" options involving conflicting moral values. We present a comprehensive evaluation of how LLMs navigate ethical dilemmas. Specifically, we investigate LLMs on their (1) sensitivity in comprehending ethical dilemmas, (2) consistency in moral value choice, (3) consideration of consequences, and (4) ability to align their responses to a moral value preference explicitly or implicitly specified in a prompt. Drawing inspiration from a leading ethical framework, we construct a dataset comprising 1,730 ethical dilemmas involving four pairs of conflicting values. We evaluate 20 well-known LLMs from six families. Our experiments reveal that: (1) LLMs exhibit pronounced preferences between major value pairs, and prioritize truth over loyalty, community over individual, and long-term over short-term considerations. (2) The larger LLMs tend to support a deontological perspective, maintaining their choices of actions even when negative consequences are specified. (3) Explicit guidelines are more effective in guiding LLMs' moral choice than in-context examples. Lastly, our experiments highlight the limitation of LLMs in comprehending different formulations of ethical dilemmas. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19926v1-abstract-full').style.display = 'none'; document.getElementById('2412.19926v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.19437">arXiv:2412.19437</a> <span> [<a href="https://arxiv.org/pdf/2412.19437">pdf</a>, <a href="https://arxiv.org/format/2412.19437">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> DeepSeek-V3 Technical Report </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=DeepSeek-AI"> DeepSeek-AI</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A">Aixin Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+B">Bei Feng</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+B">Bing Xue</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingxuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bochao Wu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Chengda Lu</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chenggang Zhao</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+C">Chengqi Deng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+C">Chong Ruan</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+D">Damai Dai</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+D">Daya Guo</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D">Dejian Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Deli Chen</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+D">Dongjie Ji</a>, <a href="/search/cs?searchtype=author&query=Li%2C+E">Erhang Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+F">Fangyun Lin</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+F">Fucong Dai</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+F">Fuli Luo</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+G">Guangbo Hao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Guanting Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guowei Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">H. Zhang</a>, <a href="/search/cs?searchtype=author&query=Bao%2C+H">Han Bao</a> , et al. (175 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.19437v1-abstract-short" style="display: inline;"> We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for loa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19437v1-abstract-full').style.display = 'inline'; document.getElementById('2412.19437v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.19437v1-abstract-full" style="display: none;"> We present DeepSeek-V3, a strong Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. To achieve efficient inference and cost-effective training, DeepSeek-V3 adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures, which were thoroughly validated in DeepSeek-V2. Furthermore, DeepSeek-V3 pioneers an auxiliary-loss-free strategy for load balancing and sets a multi-token prediction training objective for stronger performance. We pre-train DeepSeek-V3 on 14.8 trillion diverse and high-quality tokens, followed by Supervised Fine-Tuning and Reinforcement Learning stages to fully harness its capabilities. Comprehensive evaluations reveal that DeepSeek-V3 outperforms other open-source models and achieves performance comparable to leading closed-source models. Despite its excellent performance, DeepSeek-V3 requires only 2.788M H800 GPU hours for its full training. In addition, its training process is remarkably stable. Throughout the entire training process, we did not experience any irrecoverable loss spikes or perform any rollbacks. The model checkpoints are available at https://github.com/deepseek-ai/DeepSeek-V3. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.19437v1-abstract-full').style.display = 'none'; document.getElementById('2412.19437v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.18424">arXiv:2412.18424</a> <span> [<a href="https://arxiv.org/pdf/2412.18424">pdf</a>, <a href="https://arxiv.org/format/2412.18424">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LongDocURL: a Comprehensive Multimodal Long Document Benchmark Integrating Understanding, Reasoning, and Locating </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+C">Chao Deng</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiale Yuan</a>, <a href="/search/cs?searchtype=author&query=Bu%2C+P">Pi Bu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peijie Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhong-Zhi Li</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jian Xu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiao-Hui Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuan Gao</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jun Song</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Bo Zheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Cheng-Lin Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.18424v2-abstract-short" style="display: inline;"> Large vision language models (LVLMs) have improved the document understanding capabilities remarkably, enabling the handling of complex document elements, longer contexts, and a wider range of tasks. However, existing document understanding benchmarks have been limited to handling only a small number of pages and fail to provide a comprehensive analysis of layout elements locating. In this paper,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18424v2-abstract-full').style.display = 'inline'; document.getElementById('2412.18424v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.18424v2-abstract-full" style="display: none;"> Large vision language models (LVLMs) have improved the document understanding capabilities remarkably, enabling the handling of complex document elements, longer contexts, and a wider range of tasks. However, existing document understanding benchmarks have been limited to handling only a small number of pages and fail to provide a comprehensive analysis of layout elements locating. In this paper, we first define three primary task categories: Long Document Understanding, numerical Reasoning, and cross-element Locating, and then propose a comprehensive benchmark, LongDocURL, integrating above three primary tasks and comprising 20 sub-tasks categorized based on different primary tasks and answer evidences. Furthermore, we develop a semi-automated construction pipeline and collect 2,325 high-quality question-answering pairs, covering more than 33,000 pages of documents, significantly outperforming existing benchmarks. Subsequently, we conduct comprehensive evaluation experiments on both open-source and closed-source models across 26 different configurations, revealing critical performance gaps in this field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.18424v2-abstract-full').style.display = 'none'; document.getElementById('2412.18424v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17891">arXiv:2412.17891</a> <span> [<a href="https://arxiv.org/pdf/2412.17891">pdf</a>, <a href="https://arxiv.org/format/2412.17891">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> The Power of Adaptation: Boosting In-Context Learning through Adaptive Prompting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cai%2C+S">Shuzhang Cai</a>, <a href="/search/cs?searchtype=author&query=Mensah-Boateng%2C+T">Twumasi Mensah-Boateng</a>, <a href="/search/cs?searchtype=author&query=Kuksov%2C+X">Xander Kuksov</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jing Yuan</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+S">Shaojie Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17891v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated exceptional abilities across a broad range of language-related tasks, including generating solutions to complex reasoning problems. An effective technique to enhance LLM performance is in-context learning, which encourages a step-by-step reasoning process by including explanatory examples to guide the model's responses. However, selecting appropriate… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17891v1-abstract-full').style.display = 'inline'; document.getElementById('2412.17891v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17891v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated exceptional abilities across a broad range of language-related tasks, including generating solutions to complex reasoning problems. An effective technique to enhance LLM performance is in-context learning, which encourages a step-by-step reasoning process by including explanatory examples to guide the model's responses. However, selecting appropriate exemplars for the model poses a challenge, as each dataset demands a distinct set of exemplars to enable the LLM to learn effectively and perform well on the test set. Current studies often rely on uncertainty- or diversity-based selection strategies to select exemplars for annotation and to improve model learning. However, these studies typically employ a non-adaptive approach, selecting a set of exemplars all at once. We argue that this non-adaptive strategy may result in a set of exemplars with high redundancy in terms of the knowledge covered, ultimately reducing their overall informativeness. To address this limitation, we propose \textsc{Adaptive-Prompt}, a novel method that adaptively selects exemplars by leveraging model feedback from previously chosen exemplars. Experimental results show that \textsc{Adaptive-Prompt} significantly enhances LLM performance across a variety of reasoning tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17891v1-abstract-full').style.display = 'none'; document.getElementById('2412.17891v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.14922">arXiv:2412.14922</a> <span> [<a href="https://arxiv.org/pdf/2412.14922">pdf</a>, <a href="https://arxiv.org/format/2412.14922">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RobustFT: Robust Supervised Fine-tuning for Large Language Models under Noisy Response </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+J">Junyu Luo</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+X">Xiao Luo</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+K">Kaize Ding</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jingyang Yuan</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Z">Zhiping Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Ming Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.14922v1-abstract-short" style="display: inline;"> Supervised fine-tuning (SFT) plays a crucial role in adapting large language models (LLMs) to specific domains or tasks. However, as demonstrated by empirical experiments, the collected data inevitably contains noise in practical applications, which poses significant challenges to model performance on downstream tasks. Therefore, there is an urgent need for a noise-robust SFT framework to enhance… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14922v1-abstract-full').style.display = 'inline'; document.getElementById('2412.14922v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.14922v1-abstract-full" style="display: none;"> Supervised fine-tuning (SFT) plays a crucial role in adapting large language models (LLMs) to specific domains or tasks. However, as demonstrated by empirical experiments, the collected data inevitably contains noise in practical applications, which poses significant challenges to model performance on downstream tasks. Therefore, there is an urgent need for a noise-robust SFT framework to enhance model capabilities in downstream tasks. To address this challenge, we introduce a robust SFT framework (RobustFT) that performs noise detection and relabeling on downstream task data. For noise identification, our approach employs a multi-expert collaborative system with inference-enhanced models to achieve superior noise detection. In the denoising phase, we utilize a context-enhanced strategy, which incorporates the most relevant and confident knowledge followed by careful assessment to generate reliable annotations. Additionally, we introduce an effective data selection mechanism based on response entropy, ensuring only high-quality samples are retained for fine-tuning. Extensive experiments conducted on multiple LLMs across five datasets demonstrate RobustFT's exceptional performance in noisy scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.14922v1-abstract-full').style.display = 'none'; document.getElementById('2412.14922v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11863">arXiv:2412.11863</a> <span> [<a href="https://arxiv.org/pdf/2412.11863">pdf</a>, <a href="https://arxiv.org/format/2412.11863">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> GeoX: Geometric Problem Solving Through Unified Formalized Vision-Language Pre-training </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+R">Renqiu Xia</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingsheng Li</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+H">Hancheng Ye</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wenjie Wu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hongbin Zhou</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiakang Yuan</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+T">Tianshuo Peng</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+X">Xinyu Cai</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+X">Xiangchao Yan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bin Wang</a>, <a href="/search/cs?searchtype=author&query=He%2C+C">Conghui He</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+B">Botian Shi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tao Chen</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Junchi Yan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11863v2-abstract-short" style="display: inline;"> Despite their proficiency in general tasks, Multi-modal Large Language Models (MLLMs) struggle with automatic Geometry Problem Solving (GPS), which demands understanding diagrams, interpreting symbols, and performing complex reasoning. This limitation arises from their pre-training on natural images and texts, along with the lack of automated verification in the problem-solving process. Besides, c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11863v2-abstract-full').style.display = 'inline'; document.getElementById('2412.11863v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11863v2-abstract-full" style="display: none;"> Despite their proficiency in general tasks, Multi-modal Large Language Models (MLLMs) struggle with automatic Geometry Problem Solving (GPS), which demands understanding diagrams, interpreting symbols, and performing complex reasoning. This limitation arises from their pre-training on natural images and texts, along with the lack of automated verification in the problem-solving process. Besides, current geometric specialists are limited by their task-specific designs, making them less effective for broader geometric problems. To this end, we present GeoX, a multi-modal large model focusing on geometric understanding and reasoning tasks. Given the significant differences between geometric diagram-symbol and natural image-text, we introduce unimodal pre-training to develop a diagram encoder and symbol decoder, enhancing the understanding of geometric images and corpora. Furthermore, we introduce geometry-language alignment, an effective pre-training paradigm that bridges the modality gap between unimodal geometric experts. We propose a Generator-And-Sampler Transformer (GS-Former) to generate discriminative queries and eliminate uninformative representations from unevenly distributed geometric signals. Finally, GeoX benefits from visual instruction tuning, empowering it to take geometric images and questions as input and generate verifiable solutions. Experiments show that GeoX outperforms both generalists and geometric specialists on publicly recognized benchmarks, such as GeoQA, UniGeo, Geometry3K, and PGPS9k. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11863v2-abstract-full').style.display = 'none'; document.getElementById('2412.11863v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 16 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Our code is available at https://github.com/Alpha-Innovator/GeoX</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11325">arXiv:2412.11325</a> <span> [<a href="https://arxiv.org/pdf/2412.11325">pdf</a>, <a href="https://arxiv.org/format/2412.11325">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Sonicmesh: Enhancing 3D Human Mesh Reconstruction in Vision-Impaired Environments With Acoustic Signals </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaoxuan Liang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wuyang Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+H">Hong Zhou</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Z">Zhaolong Wei</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+S">Sicheng Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yansong Li</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+R">Rui Yin</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiantao Yuan</a>, <a href="/search/cs?searchtype=author&query=Gummeson%2C+J">Jeremy Gummeson</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11325v1-abstract-short" style="display: inline;"> 3D Human Mesh Reconstruction (HMR) from 2D RGB images faces challenges in environments with poor lighting, privacy concerns, or occlusions. These weaknesses of RGB imaging can be complemented by acoustic signals, which are widely available, easy to deploy, and capable of penetrating obstacles. However, no existing methods effectively combine acoustic signals with RGB data for robust 3D HMR. The pr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11325v1-abstract-full').style.display = 'inline'; document.getElementById('2412.11325v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11325v1-abstract-full" style="display: none;"> 3D Human Mesh Reconstruction (HMR) from 2D RGB images faces challenges in environments with poor lighting, privacy concerns, or occlusions. These weaknesses of RGB imaging can be complemented by acoustic signals, which are widely available, easy to deploy, and capable of penetrating obstacles. However, no existing methods effectively combine acoustic signals with RGB data for robust 3D HMR. The primary challenges include the low-resolution images generated by acoustic signals and the lack of dedicated processing backbones. We introduce SonicMesh, a novel approach combining acoustic signals with RGB images to reconstruct 3D human mesh. To address the challenges of low resolution and the absence of dedicated processing backbones in images generated by acoustic signals, we modify an existing method, HRNet, for effective feature extraction. We also integrate a universal feature embedding technique to enhance the precision of cross-dimensional feature alignment, enabling SonicMesh to achieve high accuracy. Experimental results demonstrate that SonicMesh accurately reconstructs 3D human mesh in challenging environments such as occlusions, non-line-of-sight scenarios, and poor lighting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11325v1-abstract-full').style.display = 'none'; document.getElementById('2412.11325v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.11167">arXiv:2412.11167</a> <span> [<a href="https://arxiv.org/pdf/2412.11167">pdf</a>, <a href="https://arxiv.org/format/2412.11167">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Cultural Palette: Pluralising Culture Alignment via Multi-agent Palette </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiahao Yuan</a>, <a href="/search/cs?searchtype=author&query=Di%2C+Z">Zixiang Di</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shangzixin Zhao</a>, <a href="/search/cs?searchtype=author&query=Naseem%2C+U">Usman Naseem</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.11167v2-abstract-short" style="display: inline;"> Large language models (LLMs) face challenges in aligning with diverse cultural values despite their remarkable performance in generation, which stems from inherent monocultural biases and difficulties in capturing nuanced cultural semantics. Existing methods struggle to adapt to unkown culture after fine-tuning. Inspired by cultural geography across five continents, we propose Cultural Palette, a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11167v2-abstract-full').style.display = 'inline'; document.getElementById('2412.11167v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.11167v2-abstract-full" style="display: none;"> Large language models (LLMs) face challenges in aligning with diverse cultural values despite their remarkable performance in generation, which stems from inherent monocultural biases and difficulties in capturing nuanced cultural semantics. Existing methods struggle to adapt to unkown culture after fine-tuning. Inspired by cultural geography across five continents, we propose Cultural Palette, a multi-agent framework that redefines cultural alignment as an adaptive "color-blending" process for country-specific adaptation. Our approach harnesses cultural geography across five continents (Africa, America, Asia, Europe, Oceania) through three key steps: First, we synthesize the Pentachromatic Cultural Palette Dataset using GPT-4o, refining continental-level dialogues with Hofstede cultural dimensions to establish foundational cultural representations. Second, five continent-level alignment agents form specialized cultural communities that generate region-specific draft responses. Third, a Meta Agent employs Cultural MoErges to dynamically blend these cultural "colors" through attention-gated parameter merging, akin to mixing pigments on a palette, resolving conflicts while preserving cultural nuances to produce the final culturally-aligned response. Extensive experiments across various countries demonstrate that Cultural Palette surpasses existing baselines in cultural alignment. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.11167v2-abstract-full').style.display = 'none'; document.getElementById('2412.11167v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 15 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10741">arXiv:2412.10741</a> <span> [<a href="https://arxiv.org/pdf/2412.10741">pdf</a>, <a href="https://arxiv.org/format/2412.10741">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> RegMixMatch: Optimizing Mixup Utilization in Semi-Supervised Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+H">Haorong Han</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jidong Yuan</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+C">Chixuan Wei</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zhongyang Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10741v1-abstract-short" style="display: inline;"> Consistency regularization and pseudo-labeling have significantly advanced semi-supervised learning (SSL). Prior works have effectively employed Mixup for consistency regularization in SSL. However, our findings indicate that applying Mixup for consistency regularization may degrade SSL performance by compromising the purity of artificial labels. Moreover, most pseudo-labeling based methods utiliz… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10741v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10741v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10741v1-abstract-full" style="display: none;"> Consistency regularization and pseudo-labeling have significantly advanced semi-supervised learning (SSL). Prior works have effectively employed Mixup for consistency regularization in SSL. However, our findings indicate that applying Mixup for consistency regularization may degrade SSL performance by compromising the purity of artificial labels. Moreover, most pseudo-labeling based methods utilize thresholding strategy to exclude low-confidence data, aiming to mitigate confirmation bias; however, this approach limits the utility of unlabeled samples. To address these challenges, we propose RegMixMatch, a novel framework that optimizes the use of Mixup with both high- and low-confidence samples in SSL. First, we introduce semi-supervised RegMixup, which effectively addresses reduced artificial labels purity by using both mixed samples and clean samples for training. Second, we develop a class-aware Mixup technique that integrates information from the top-2 predicted classes into low-confidence samples and their artificial labels, reducing the confirmation bias associated with these samples and enhancing their effective utilization. Experimental results demonstrate that RegMixMatch achieves state-of-the-art performance across various SSL benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10741v1-abstract-full').style.display = 'none'; document.getElementById('2412.10741v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.10266">arXiv:2412.10266</a> <span> [<a href="https://arxiv.org/pdf/2412.10266">pdf</a>, <a href="https://arxiv.org/format/2412.10266">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Reasoner Outperforms: Generative Stance Detection with Rationalization for Social Media </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiaqing Yuan</a>, <a href="/search/cs?searchtype=author&query=Xi%2C+R">Ruijie Xi</a>, <a href="/search/cs?searchtype=author&query=Singh%2C+M+P">Munindar P. Singh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.10266v1-abstract-short" style="display: inline;"> Stance detection is crucial for fostering a human-centric Web by analyzing user-generated content to identify biases and harmful narratives that undermine trust. With the development of Large Language Models (LLMs), existing approaches treat stance detection as a classification problem, providing robust methodologies for modeling complex group interactions and advancing capabilities in natural lan… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10266v1-abstract-full').style.display = 'inline'; document.getElementById('2412.10266v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.10266v1-abstract-full" style="display: none;"> Stance detection is crucial for fostering a human-centric Web by analyzing user-generated content to identify biases and harmful narratives that undermine trust. With the development of Large Language Models (LLMs), existing approaches treat stance detection as a classification problem, providing robust methodologies for modeling complex group interactions and advancing capabilities in natural language tasks. However, these methods often lack interpretability, limiting their ability to offer transparent and understandable justifications for predictions. This study adopts a generative approach, where stance predictions include explicit, interpretable rationales, and integrates them into smaller language models through single-task and multitask learning. We find that incorporating reasoning into stance detection enables the smaller model (FlanT5) to outperform GPT-3.5's zero-shot performance, achieving an improvement of up to 9.57%. Moreover, our results show that reasoning capabilities enhance multitask learning performance but may reduce effectiveness in single-task settings. Crucially, we demonstrate that faithful rationales improve rationale distillation into SLMs, advancing efforts to build interpretable, trustworthy systems for addressing discrimination, fostering trust, and promoting equitable engagement on social media. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.10266v1-abstract-full').style.display = 'none'; document.getElementById('2412.10266v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.09912">arXiv:2412.09912</a> <span> [<a href="https://arxiv.org/pdf/2412.09912">pdf</a>, <a href="https://arxiv.org/format/2412.09912">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> All-in-One: Transferring Vision Foundation Models into Stereo Matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jingyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haoyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiakang Yuan</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+P">Peng Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tao Chen</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hao Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Meiya Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yangyang Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.09912v1-abstract-short" style="display: inline;"> As a fundamental vision task, stereo matching has made remarkable progress. While recent iterative optimization-based methods have achieved promising performance, their feature extraction capabilities still have room for improvement. Inspired by the ability of vision foundation models (VFMs) to extract general representations, in this work, we propose AIO-Stereo which can flexibly select and trans… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09912v1-abstract-full').style.display = 'inline'; document.getElementById('2412.09912v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.09912v1-abstract-full" style="display: none;"> As a fundamental vision task, stereo matching has made remarkable progress. While recent iterative optimization-based methods have achieved promising performance, their feature extraction capabilities still have room for improvement. Inspired by the ability of vision foundation models (VFMs) to extract general representations, in this work, we propose AIO-Stereo which can flexibly select and transfer knowledge from multiple heterogeneous VFMs to a single stereo matching model. To better reconcile features between heterogeneous VFMs and the stereo matching model and fully exploit prior knowledge from VFMs, we proposed a dual-level feature utilization mechanism that aligns heterogeneous features and transfers multi-level knowledge. Based on the mechanism, a dual-level selective knowledge transfer module is designed to selectively transfer knowledge and integrate the advantages of multiple VFMs. Experimental results show that AIO-Stereo achieves start-of-the-art performance on multiple datasets and ranks $1^{st}$ on the Middlebury dataset and outperforms all the published work on the ETH3D benchmark. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.09912v1-abstract-full').style.display = 'none'; document.getElementById('2412.09912v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by AAAI 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.06259">arXiv:2412.06259</a> <span> [<a href="https://arxiv.org/pdf/2412.06259">pdf</a>, <a href="https://arxiv.org/format/2412.06259">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Prompt Learning and Pause Encoding for Alzheimer's Disease Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yin-Long Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+R">Rui Feng</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jia-Hong Yuan</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zhen-Hua Ling</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.06259v1-abstract-short" style="display: inline;"> Compared to other clinical screening techniques, speech-and-language-based automated Alzheimer's disease (AD) detection methods are characterized by their non-invasiveness, cost-effectiveness, and convenience. Previous studies have demonstrated the efficacy of fine-tuning pre-trained language models (PLMs) for AD detection. However, the objective of this traditional fine-tuning method, which invol… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06259v1-abstract-full').style.display = 'inline'; document.getElementById('2412.06259v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.06259v1-abstract-full" style="display: none;"> Compared to other clinical screening techniques, speech-and-language-based automated Alzheimer's disease (AD) detection methods are characterized by their non-invasiveness, cost-effectiveness, and convenience. Previous studies have demonstrated the efficacy of fine-tuning pre-trained language models (PLMs) for AD detection. However, the objective of this traditional fine-tuning method, which involves inputting only transcripts, is inconsistent with the masked language modeling (MLM) task used during the pre-training phase of PLMs. In this paper, we investigate prompt-based fine-tuning of PLMs, converting the classification task into a MLM task by inserting prompt templates into the transcript inputs. We also explore the impact of incorporating pause information from forced alignment into manual transcripts. Additionally, we compare the performance of various automatic speech recognition (ASR) models and select the Whisper model to generate ASR-based transcripts for comparison with manual transcripts. Furthermore, majority voting and ensemble techniques are applied across different PLMs (BERT and RoBERTa) using different random seeds. Ultimately, we obtain maximum detection accuracy of 95.8% (with mean 87.9%, std 3.3%) using manual transcripts, achieving state-of-the-art performance for AD detection using only transcripts on the ADReSS test set. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.06259v1-abstract-full').style.display = 'none'; document.getElementById('2412.06259v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ISCSLP 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05776">arXiv:2412.05776</a> <span> [<a href="https://arxiv.org/pdf/2412.05776">pdf</a>, <a href="https://arxiv.org/format/2412.05776">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Genomics">q-bio.GN</span> </div> </div> <p class="title is-5 mathjax"> ProtGO: A Transformer based Fusion Model for accurately predicting Gene Ontology (GO) Terms from full scale Protein Sequences </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tamir%2C+A">Azwad Tamir</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiann-Shiun Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05776v1-abstract-short" style="display: inline;"> Recent developments in next generation sequencing technology have led to the creation of extensive, open-source protein databases consisting of hundreds of millions of sequences. To render these sequences applicable in biomedical applications, they must be meticulously annotated by wet lab testing or extracting them from existing literature. Over the last few years, researchers have developed nume… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05776v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05776v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05776v1-abstract-full" style="display: none;"> Recent developments in next generation sequencing technology have led to the creation of extensive, open-source protein databases consisting of hundreds of millions of sequences. To render these sequences applicable in biomedical applications, they must be meticulously annotated by wet lab testing or extracting them from existing literature. Over the last few years, researchers have developed numerous automatic annotation systems, particularly deep learning models based on machine learning and artificial intelligence, to address this issue. In this work, we propose a transformer-based fusion model capable of predicting Gene Ontology (GO) terms from full-scale protein sequences, achieving state-of-the-art accuracy compared to other contemporary machine learning annotation systems. The approach performs particularly well on clustered split datasets, which comprise training and testing samples originating from distinct distributions that are structurally diverse. This demonstrates that the model is able to understand both short and long term dependencies within the enzyme's structure and can precisely identify the motifs associated with the various GO terms. Furthermore, the technique is lightweight and less computationally expensive compared to the benchmark methods, while at the same time not unaffected by sequence length, rendering it appropriate for diverse applications with varying sequence lengths. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05776v1-abstract-full').style.display = 'none'; document.getElementById('2412.05776v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 3 figures, 4 tables, This work would be submitted to Scientific journals for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05770">arXiv:2412.05770</a> <span> [<a href="https://arxiv.org/pdf/2412.05770">pdf</a>, <a href="https://arxiv.org/format/2412.05770">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> KITE-DDI: A Knowledge graph Integrated Transformer Model for accurately predicting Drug-Drug Interaction Events from Drug SMILES and Biomedical Knowledge Graph </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tamir%2C+A">Azwad Tamir</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiann-Shiun Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05770v1-abstract-short" style="display: inline;"> It is a common practice in modern medicine to prescribe multiple medications simultaneously to treat diseases. However, these medications could have adverse reactions between them, known as Drug-Drug Interactions (DDI), which have the potential to cause significant bodily injury and could even be fatal. Hence, it is essential to identify all the DDI events before prescribing multiple drugs to a pa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05770v1-abstract-full').style.display = 'inline'; document.getElementById('2412.05770v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05770v1-abstract-full" style="display: none;"> It is a common practice in modern medicine to prescribe multiple medications simultaneously to treat diseases. However, these medications could have adverse reactions between them, known as Drug-Drug Interactions (DDI), which have the potential to cause significant bodily injury and could even be fatal. Hence, it is essential to identify all the DDI events before prescribing multiple drugs to a patient. Most contemporary research for predicting DDI events relies on either information from Biomedical Knowledge graphs (KG) or drug SMILES, with very few managing to merge data from both to make predictions. While others use heuristic algorithms to extract features from SMILES and KGs, which are then fed into a Deep Learning framework to generate output. In this study, we propose a KG-integrated Transformer architecture to generate an end-to-end fully automated Machine Learning pipeline for predicting DDI events with high accuracy. The algorithm takes full-scale molecular SMILES sequences of a pair of drugs and a biomedical KG as input and predicts the interaction between the two drugs with high precision. The results show superior performance in two different benchmark datasets compared to existing state-of-the-art models especially when the test and training sets contain distinct sets of drug molecules. This demonstrates the strong generalization of the proposed model, indicating its potential for DDI event prediction for newly developed drugs. The model does not depend on heuristic models for generating embeddings and has a minimal number of hyperparameters, making it easy to use while demonstrating outstanding performance in low-data scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05770v1-abstract-full').style.display = 'none'; document.getElementById('2412.05770v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 8 figures, 5 Tables, This work has been submitted to IEEE for possible publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.05000">arXiv:2412.05000</a> <span> [<a href="https://arxiv.org/pdf/2412.05000">pdf</a>, <a href="https://arxiv.org/format/2412.05000">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Noise Matters: Diffusion Model-based Urban Mobility Generation with Collaborative Noise Priors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yuan Yuan</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+J">Jingtao Ding</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jian Yuan</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.05000v2-abstract-short" style="display: inline;"> With global urbanization, the focus on sustainable cities has largely grown, driving research into equity, resilience, and urban planning, which often relies on mobility data. The rise of web-based apps and mobile devices has provided valuable user data for mobility-related research. However, real-world mobility data is costly and raises privacy concerns. To protect privacy while retaining key fea… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05000v2-abstract-full').style.display = 'inline'; document.getElementById('2412.05000v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.05000v2-abstract-full" style="display: none;"> With global urbanization, the focus on sustainable cities has largely grown, driving research into equity, resilience, and urban planning, which often relies on mobility data. The rise of web-based apps and mobile devices has provided valuable user data for mobility-related research. However, real-world mobility data is costly and raises privacy concerns. To protect privacy while retaining key features of real-world movement, the demand for synthetic data has steadily increased. Recent advances in diffusion models have shown great potential for mobility trajectory generation due to their ability to model randomness and uncertainty. However, existing approaches often directly apply identically distributed (i.i.d.) noise sampling from image generation techniques, which fail to account for the spatiotemporal correlations and social interactions that shape urban mobility patterns. In this paper, we propose CoDiffMob, a diffusion model for urban mobility generation with collaborative noise priors, we emphasize the critical role of noise in diffusion models for generating mobility data. By leveraging both individual movement characteristics and population-wide dynamics, we construct novel collaborative noise priors that provide richer and more informative guidance throughout the generation process. Extensive experiments demonstrate the superiority of our method, with generated data accurately capturing both individual preferences and collective patterns, achieving an improvement of over 32%. Furthermore, it can effectively replace web-derived mobility data to better support downstream applications, while safeguarding user privacy and fostering a more secure and ethical web. This highlights its tremendous potential for applications in sustainable city-related research. The code and data are available at https://github.com/tsinghua-fib-lab/CoDiffMob. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.05000v2-abstract-full').style.display = 'none'; document.getElementById('2412.05000v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.04931">arXiv:2412.04931</a> <span> [<a href="https://arxiv.org/pdf/2412.04931">pdf</a>, <a href="https://arxiv.org/format/2412.04931">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1007/978-3-031-78447-7_16">10.1007/978-3-031-78447-7_16 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> DEYOLO: Dual-Feature-Enhancement YOLO for Cross-Modality Object Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yishuo Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Boran Wang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xinyu Guo</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wenbin Zhu</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Jiasheng He</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaobin Liu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jing Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.04931v1-abstract-short" style="display: inline;"> Object detection in poor-illumination environments is a challenging task as objects are usually not clearly visible in RGB images. As infrared images provide additional clear edge information that complements RGB images, fusing RGB and infrared images has potential to enhance the detection ability in poor-illumination environments. However, existing works involving both visible and infrared images… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04931v1-abstract-full').style.display = 'inline'; document.getElementById('2412.04931v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.04931v1-abstract-full" style="display: none;"> Object detection in poor-illumination environments is a challenging task as objects are usually not clearly visible in RGB images. As infrared images provide additional clear edge information that complements RGB images, fusing RGB and infrared images has potential to enhance the detection ability in poor-illumination environments. However, existing works involving both visible and infrared images only focus on image fusion, instead of object detection. Moreover, they directly fuse the two kinds of image modalities, which ignores the mutual interference between them. To fuse the two modalities to maximize the advantages of cross-modality, we design a dual-enhancement-based cross-modality object detection network DEYOLO, in which semantic-spatial cross modality and novel bi-directional decoupled focus modules are designed to achieve the detection-centered mutual enhancement of RGB-infrared (RGB-IR). Specifically, a dual semantic enhancing channel weight assignment module (DECA) and a dual spatial enhancing pixel weight assignment module (DEPA) are firstly proposed to aggregate cross-modality information in the feature space to improve the feature representation ability, such that feature fusion can aim at the object detection task. Meanwhile, a dual-enhancement mechanism, including enhancements for two-modality fusion and single modality, is designed in both DECAand DEPAto reduce interference between the two kinds of image modalities. Then, a novel bi-directional decoupled focus is developed to enlarge the receptive field of the backbone network in different directions, which improves the representation quality of DEYOLO. Extensive experiments on M3FD and LLVIP show that our approach outperforms SOTA object detection algorithms by a clear margin. Our code is available at https://github.com/chips96/DEYOLO. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.04931v1-abstract-full').style.display = 'none'; document.getElementById('2412.04931v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> The International Conference on Pattern Recognition (ICPR),2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.03603">arXiv:2412.03603</a> <span> [<a href="https://arxiv.org/pdf/2412.03603">pdf</a>, <a href="https://arxiv.org/format/2412.03603">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HunyuanVideo: A Systematic Framework For Large Video Generative Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kong%2C+W">Weijie Kong</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Q">Qi Tian</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zijian Zhang</a>, <a href="/search/cs?searchtype=author&query=Min%2C+R">Rox Min</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Z">Zuozhuo Dai</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+J">Jin Zhou</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+J">Jiangfeng Xiong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+B">Bo Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+K">Kathrina Wu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Q">Qin Lin</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Junkun Yuan</a>, <a href="/search/cs?searchtype=author&query=Long%2C+Y">Yanxin Long</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Aladdin Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+A">Andong Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Changlin Li</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+D">Duojun Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+F">Fang Yang</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hao Tan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongmei Wang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+J">Jacob Song</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+J">Jiawang Bai</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jianbing Wu</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+J">Jinbao Xue</a> , et al. (27 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.03603v4-abstract-short" style="display: inline;"> Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates per… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03603v4-abstract-full').style.display = 'inline'; document.getElementById('2412.03603v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.03603v4-abstract-full" style="display: none;"> Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at https://github.com/Tencent/HunyuanVideo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.03603v4-abstract-full').style.display = 'none'; document.getElementById('2412.03603v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 3 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01770">arXiv:2412.01770</a> <span> [<a href="https://arxiv.org/pdf/2412.01770">pdf</a>, <a href="https://arxiv.org/format/2412.01770">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Robot Learning with Super-Linear Scaling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Torne%2C+M">Marcel Torne</a>, <a href="/search/cs?searchtype=author&query=Jain%2C+A">Arhan Jain</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiayi Yuan</a>, <a href="/search/cs?searchtype=author&query=Macha%2C+V">Vidaaranya Macha</a>, <a href="/search/cs?searchtype=author&query=Ankile%2C+L">Lars Ankile</a>, <a href="/search/cs?searchtype=author&query=Simeonov%2C+A">Anthony Simeonov</a>, <a href="/search/cs?searchtype=author&query=Agrawal%2C+P">Pulkit Agrawal</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+A">Abhishek Gupta</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01770v2-abstract-short" style="display: inline;"> Scaling robot learning requires data collection pipelines that scale favorably with human effort. In this work, we propose Crowdsourcing and Amortizing Human Effort for Real-to-Sim-to-Real(CASHER), a pipeline for scaling up data collection and learning in simulation where the performance scales superlinearly with human effort. The key idea is to crowdsource digital twins of real-world scenes using… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01770v2-abstract-full').style.display = 'inline'; document.getElementById('2412.01770v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01770v2-abstract-full" style="display: none;"> Scaling robot learning requires data collection pipelines that scale favorably with human effort. In this work, we propose Crowdsourcing and Amortizing Human Effort for Real-to-Sim-to-Real(CASHER), a pipeline for scaling up data collection and learning in simulation where the performance scales superlinearly with human effort. The key idea is to crowdsource digital twins of real-world scenes using 3D reconstruction and collect large-scale data in simulation, rather than the real-world. Data collection in simulation is initially driven by RL, bootstrapped with human demonstrations. As the training of a generalist policy progresses across environments, its generalization capabilities can be used to replace human effort with model generated demonstrations. This results in a pipeline where behavioral data is collected in simulation with continually reducing human effort. We show that CASHER demonstrates zero-shot and few-shot scaling laws on three real-world tasks across diverse scenarios. We show that CASHER enables fine-tuning of pre-trained policies to a target scenario using a video scan without any additional human effort. See our project website: https://casher-robot-learning.github.io/CASHER/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01770v2-abstract-full').style.display = 'none'; document.getElementById('2412.01770v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.01291">arXiv:2412.01291</a> <span> [<a href="https://arxiv.org/pdf/2412.01291">pdf</a>, <a href="https://arxiv.org/format/2412.01291">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> </div> </div> <p class="title is-5 mathjax"> Global Estimation of Building-Integrated Facade and Rooftop Photovoltaic Potential by Integrating 3D Building Footprint and Spatio-Temporal Datasets </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qing Yu</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+K">Kechuan Dong</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+Z">Zhiling Guo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiaxing Li</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+H">Hongjun Tan</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Y">Yanxiu Jin</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jian Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haoran Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Junwei Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qi Chen</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">Jinyue Yan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.01291v1-abstract-short" style="display: inline;"> This research tackles the challenges of estimating Building-Integrated Photovoltaics (BIPV) potential across various temporal and spatial scales, accounting for different geographical climates and urban morphology. We introduce a holistic methodology for evaluating BIPV potential, integrating 3D building footprint models with diverse meteorological data sources to account for dynamic shadow effect… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01291v1-abstract-full').style.display = 'inline'; document.getElementById('2412.01291v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.01291v1-abstract-full" style="display: none;"> This research tackles the challenges of estimating Building-Integrated Photovoltaics (BIPV) potential across various temporal and spatial scales, accounting for different geographical climates and urban morphology. We introduce a holistic methodology for evaluating BIPV potential, integrating 3D building footprint models with diverse meteorological data sources to account for dynamic shadow effects. The approach enables the assessment of PV potential on facades and rooftops at different levels-individual buildings, urban blocks, and cities globally. Through an analysis of 120 typical cities, we highlight the importance of 3D building forms, cityscape morphology, and geographic positioning in measuring BIPV potential at various levels. In particular, our simulation study reveals that among cities with optimal facade PV performance, the average ratio of facade PV potential to rooftop PV potential is approximately 68.2%. Additionally, approximately 17.5% of the analyzed samples demonstrate even higher facade PV potentials compared to rooftop installations. This finding underscores the strategic value of incorporating facade PV applications into urban sustainable energy systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.01291v1-abstract-full').style.display = 'none'; document.getElementById('2412.01291v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00535">arXiv:2412.00535</a> <span> [<a href="https://arxiv.org/pdf/2412.00535">pdf</a>, <a href="https://arxiv.org/format/2412.00535">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> FullStack Bench: Evaluating LLMs as Full Stack Coders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bytedance-Seed-Foundation-Code-Team"> Bytedance-Seed-Foundation-Code-Team</a>, <a href="/search/cs?searchtype=author&query=%3A"> :</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yao Cheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jianfeng Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jie Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Li Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liyu Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wentao Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhengyu Chen</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+S">Shijie Geng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+A">Aoyan Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bo Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Bowen Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+L">Linyi Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Boyi Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jerry Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kaibo Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qi Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shukai Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Siyao Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianyi Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tingkai Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yongfei Liu</a>, <a href="/search/cs?searchtype=author&query=Long%2C+R">Rui Long</a>, <a href="/search/cs?searchtype=author&query=Mai%2C+J">Jing Mai</a> , et al. (31 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00535v5-abstract-short" style="display: inline;"> As the capabilities of code large language models (LLMs) continue to expand, their applications across diverse code intelligence domains are rapidly increasing. However, most existing datasets only evaluate limited application domains. To address this gap, we have developed a comprehensive code evaluation dataset FullStack Bench focusing on full-stack programming, which encompasses a wide range of… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00535v5-abstract-full').style.display = 'inline'; document.getElementById('2412.00535v5-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00535v5-abstract-full" style="display: none;"> As the capabilities of code large language models (LLMs) continue to expand, their applications across diverse code intelligence domains are rapidly increasing. However, most existing datasets only evaluate limited application domains. To address this gap, we have developed a comprehensive code evaluation dataset FullStack Bench focusing on full-stack programming, which encompasses a wide range of application domains (e.g., basic programming, data analysis, software engineering, mathematics, and machine learning). Besides, to assess multilingual programming capabilities, in FullStack Bench, we design real-world instructions and corresponding unit test cases from 16 widely-used programming languages to reflect real-world usage scenarios rather than simple translations. Moreover, we also release an effective code sandbox execution tool (i.e., SandboxFusion) supporting various programming languages and packages to evaluate the performance of our FullStack Bench efficiently. Comprehensive experimental results on our FullStack Bench demonstrate the necessity and effectiveness of our FullStack Bench and SandboxFusion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00535v5-abstract-full').style.display = 'none'; document.getElementById('2412.00535v5-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.00058">arXiv:2412.00058</a> <span> [<a href="https://arxiv.org/pdf/2412.00058">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Real-time volumetric free-hand ultrasound imaging for large-sized organs: A study of imaging the whole spine </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+C">Caozhe Li</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+E">Enxiang Shen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoyang Wang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yuxin Wang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jie Yuan</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+L">Li Gong</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+D">Di Zhao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weijing Zhang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhibin Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.00058v1-abstract-short" style="display: inline;"> Three-dimensional (3D) ultrasound imaging can overcome the limitations of conventional two dimensional (2D) ultrasound imaging in structural observation and measurement. However, conducting volumetric ultrasound imaging for large-sized organs still faces difficulties including long acquisition time, inevitable patient movement, and 3D feature recognition. In this study, we proposed a real-time vol… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00058v1-abstract-full').style.display = 'inline'; document.getElementById('2412.00058v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.00058v1-abstract-full" style="display: none;"> Three-dimensional (3D) ultrasound imaging can overcome the limitations of conventional two dimensional (2D) ultrasound imaging in structural observation and measurement. However, conducting volumetric ultrasound imaging for large-sized organs still faces difficulties including long acquisition time, inevitable patient movement, and 3D feature recognition. In this study, we proposed a real-time volumetric free-hand ultrasound imaging system optimized for the above issues and applied it to the clinical diagnosis of scoliosis. This study employed an incremental imaging method coupled with algorithmic acceleration to enable real-time processing and visualization of the large amounts of data generated when scanning large-sized organs. Furthermore, to deal with the difficulty of image feature recognition, we proposed two tissue segmentation algorithms to reconstruct and visualize the spinal anatomy in 3D space by approximating the depth at which the bone structures are located and segmenting the ultrasound images at different depths. We validated the adaptability of our system by deploying it to multiple models of ultra-sound equipment and conducting experiments using different types of ultrasound probes. We also conducted experiments on 6 scoliosis patients and 10 normal volunteers to evaluate the performance of our proposed method. Ultrasound imaging of a volunteer spine from shoulder to crotch (more than 500 mm) was performed in 2 minutes, and the 3D imaging results displayed in real-time were compared with the corresponding X-ray images with a correlation coefficient of 0.96 in spinal curvature. Our proposed volumetric ultrasound imaging system might hold the potential to be clinically applied to other large-sized organs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.00058v1-abstract-full').style.display = 'none'; document.getElementById('2412.00058v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> December 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.19143">arXiv:2411.19143</a> <span> [<a href="https://arxiv.org/pdf/2411.19143">pdf</a>, <a href="https://arxiv.org/format/2411.19143">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Co-Learning: Towards Semi-Supervised Object Detection with Road-side Cameras </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jicheng Yuan</a>, <a href="/search/cs?searchtype=author&query=Le-Tuan%2C+A">Anh Le-Tuan</a>, <a href="/search/cs?searchtype=author&query=Ganbarov%2C+A">Ali Ganbarov</a>, <a href="/search/cs?searchtype=author&query=Hauswirth%2C+M">Manfred Hauswirth</a>, <a href="/search/cs?searchtype=author&query=Le-Phuoc%2C+D">Danh Le-Phuoc</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.19143v1-abstract-short" style="display: inline;"> Recently, deep learning has experienced rapid expansion, contributing significantly to the progress of supervised learning methodologies. However, acquiring labeled data in real-world settings can be costly, labor-intensive, and sometimes scarce. This challenge inhibits the extensive use of neural networks for practical tasks due to the impractical nature of labeling vast datasets for every indivi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19143v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19143v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19143v1-abstract-full" style="display: none;"> Recently, deep learning has experienced rapid expansion, contributing significantly to the progress of supervised learning methodologies. However, acquiring labeled data in real-world settings can be costly, labor-intensive, and sometimes scarce. This challenge inhibits the extensive use of neural networks for practical tasks due to the impractical nature of labeling vast datasets for every individual application. To tackle this, semi-supervised learning (SSL) offers a promising solution by using both labeled and unlabeled data to train object detectors, potentially enhancing detection efficacy and reducing annotation costs. Nevertheless, SSL faces several challenges, including pseudo-target inconsistencies, disharmony between classification and regression tasks, and efficient use of abundant unlabeled data, especially on edge devices, such as roadside cameras. Thus, we developed a teacher-student-based SSL framework, Co-Learning, which employs mutual learning and annotation-alignment strategies to adeptly navigate these complexities and achieves comparable performance as fully-supervised solutions using 10\% labeled data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19143v1-abstract-full').style.display = 'none'; document.getElementById('2411.19143v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at EAmSI24: Edge AI meets swarm intelligence</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15721">arXiv:2411.15721</a> <span> [<a href="https://arxiv.org/pdf/2411.15721">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Research on Effectiveness Evaluation and Optimization of Baseball Teaching Method Based on Machine Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+S">Shaoxuan Sun</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jingao Yuan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yuelin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15721v1-abstract-short" style="display: inline;"> In modern physical education, data-driven evaluation methods have gradually attracted attention, especially the quantitative prediction of students' sports performance through machine learning model. The purpose of this study is to use a variety of machine learning models to regress and predict students' comprehensive scores in baseball training, so as to evaluate the effectiveness of the current… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15721v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15721v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15721v1-abstract-full" style="display: none;"> In modern physical education, data-driven evaluation methods have gradually attracted attention, especially the quantitative prediction of students' sports performance through machine learning model. The purpose of this study is to use a variety of machine learning models to regress and predict students' comprehensive scores in baseball training, so as to evaluate the effectiveness of the current baseball teaching methods and put forward targeted training optimization suggestions. We set up a model and evaluate the performance of students by collecting many characteristics, such as hitting times, running times and batting. The experimental results show that K-Neighbors Regressor and Gradient Boosting Regressor are excellent in comprehensive prediction accuracy and stability, and the R score and error index are significantly better than other models. In addition, through the analysis of feature importance, it is found that cumulative hits and cumulative runs are the key factors affecting students' comprehensive scores. Based on the results of this study, this paper puts forward some suggestions on optimizing training strategies to help students get better performance in baseball training. The results show that the data-driven teaching evaluation method can effectively support physical education and promote personalized and refined teaching plan design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15721v1-abstract-full').style.display = 'none'; document.getElementById('2411.15721v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15529">arXiv:2411.15529</a> <span> [<a href="https://arxiv.org/pdf/2411.15529">pdf</a>, <a href="https://arxiv.org/format/2411.15529">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> Uplink Multiple Access with Heterogeneous Blocklength and Reliability Constraints: Discrete Signaling with Treating Interference as Noise </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qiu%2C+M">Min Qiu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yu-Chih Huang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jinhong Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15529v1-abstract-short" style="display: inline;"> We consider the uplink multiple access of heterogeneous users, e.g., ultra-reliable low-latency communications (URLLC) and enhanced mobile broadband (eMBB) users. Each user has its own reliability requirement and blocklength constraint, and users transmitting longer blocks suffer from heterogeneous interference. On top of that, the decoding of URLLC messages cannot leverage successive interference… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15529v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15529v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15529v1-abstract-full" style="display: none;"> We consider the uplink multiple access of heterogeneous users, e.g., ultra-reliable low-latency communications (URLLC) and enhanced mobile broadband (eMBB) users. Each user has its own reliability requirement and blocklength constraint, and users transmitting longer blocks suffer from heterogeneous interference. On top of that, the decoding of URLLC messages cannot leverage successive interference cancellation (SIC) owing to the stringent latency requirements. This can significantly degrade the spectral efficiency of all URLLC users when the interference is strong. To overcome this issue, we propose a new multiple access scheme employing discrete signaling and treating interference as noise (TIN) decoding, i.e., without SIC. Specifically, to handle heterogeneous interference while maintaining the single-user encoding and decoding complexities, each user uses a single channel code and maps its coded bits onto sub-blocks of symbols, where the underlying constellations can be different. We demonstrate theoretically and numerically that the proposed scheme employing quadrature amplitude modulations and TIN decoding can perform very close to the benchmark scheme based on Gaussian signaling with perfect SIC decoding. Interestingly, we show that the proposed scheme does not need to use all the transmit power budget, but also can sometimes even outperform the benchmark scheme. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15529v1-abstract-full').style.display = 'none'; document.getElementById('2411.15529v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">14 pages, 7 figures, accepted by IEEE Transactions on Communications. arXiv admin note: text overlap with arXiv:2308.08883</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14487">arXiv:2411.14487</a> <span> [<a href="https://arxiv.org/pdf/2411.14487">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Ensuring Safety and Trust: Analyzing the Risks of Large Language Models in Medicine </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yifan Yang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Q">Qiao Jin</a>, <a href="/search/cs?searchtype=author&query=Leaman%2C+R">Robert Leaman</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyu Liu</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+G">Guangzhi Xiong</a>, <a href="/search/cs?searchtype=author&query=Sarfo-Gyamfi%2C+M">Maame Sarfo-Gyamfi</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+C">Changlin Gong</a>, <a href="/search/cs?searchtype=author&query=Ferri%C3%A8re-Steinert%2C+S">Santiago Ferri猫re-Steinert</a>, <a href="/search/cs?searchtype=author&query=Wilbur%2C+W+J">W. John Wilbur</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiaojun Li</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiaxin Yuan</a>, <a href="/search/cs?searchtype=author&query=An%2C+B">Bang An</a>, <a href="/search/cs?searchtype=author&query=Castro%2C+K+S">Kelvin S. Castro</a>, <a href="/search/cs?searchtype=author&query=%C3%81lvarez%2C+F+E">Francisco Erramuspe 脕lvarez</a>, <a href="/search/cs?searchtype=author&query=Stockle%2C+M">Mat铆as Stockle</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+A">Aidong Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+F">Furong Huang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhiyong Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14487v1-abstract-short" style="display: inline;"> The remarkable capabilities of Large Language Models (LLMs) make them increasingly compelling for adoption in real-world healthcare applications. However, the risks associated with using LLMs in medical applications have not been systematically characterized. We propose using five key principles for safe and trustworthy medical AI: Truthfulness, Resilience, Fairness, Robustness, and Privacy, along… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14487v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14487v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14487v1-abstract-full" style="display: none;"> The remarkable capabilities of Large Language Models (LLMs) make them increasingly compelling for adoption in real-world healthcare applications. However, the risks associated with using LLMs in medical applications have not been systematically characterized. We propose using five key principles for safe and trustworthy medical AI: Truthfulness, Resilience, Fairness, Robustness, and Privacy, along with ten specific aspects. Under this comprehensive framework, we introduce a novel MedGuard benchmark with 1,000 expert-verified questions. Our evaluation of 11 commonly used LLMs shows that the current language models, regardless of their safety alignment mechanisms, generally perform poorly on most of our benchmarks, particularly when compared to the high performance of human physicians. Despite recent reports indicate that advanced LLMs like ChatGPT can match or even exceed human performance in various medical tasks, this study underscores a significant safety gap, highlighting the crucial need for human oversight and the implementation of AI safety guardrails. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14487v1-abstract-full').style.display = 'none'; document.getElementById('2411.14487v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yuan%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yuan%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yuan%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yuan%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yuan%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yuan%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>