Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 458 results for author: <span class="mathjax">Tan, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Tan%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Tan, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Tan%2C+J&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Tan, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Tan%2C+J&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Tan%2C+J&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Tan%2C+J&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Tan%2C+J&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Tan%2C+J&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Tan%2C+J&amp;start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">&hellip;</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05698">arXiv:2502.05698</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05698">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> A Conceptual Exploration of Generative AI-Induced Cognitive Dissonance and its Emergence in University-Level Academic Writing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Seran%2C+C+E">Carl Errol Seran</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+M+J+T">Myles Joshua Toledo Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Karim%2C+H+A">Hezerul Abdul Karim</a>, <a href="/search/cs?searchtype=author&amp;query=AlDahoul%2C+N">Nouar AlDahoul</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05698v1-abstract-short" style="display: inline;"> The integration of Generative Artificial Intelligence (GenAI) into university-level academic writing presents both opportunities and challenges, particularly in relation to cognitive dissonance (CD). This work explores how GenAI serves as both a trigger and amplifier of CD, as students navigate ethical concerns, academic integrity, and self-efficacy in their writing practices. 
By synthesizing empirical evidence and theoretical insights, we introduce a hypothetical construct of GenAI-induced CD, illustrating the psychological tension between AI-driven efficiency and the principles of originality, effort, and intellectual ownership. We further discuss strategies to mitigate this dissonance, including reflective pedagogy, AI literacy programs, transparency in GenAI use, and discipline-specific task redesigns. These approaches reinforce critical engagement with AI, fostering a balanced perspective that integrates technological advancements while safeguarding human creativity and learning. Our findings contribute to ongoing discussions on AI in education, self-regulated learning, and ethical AI use, offering a conceptual framework for institutions to develop guidelines that align AI adoption with academic values.
Submitted 8 February, 2025; originally announced February 2025.
Comments: 9 pages, 1 figure

2. arXiv:2502.05562 [pdf, other] cs.DB
Can Large Language Models Be Query Optimizer for Relational Databases?
</p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+J">Jie Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+K">Kangfei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+J+X">Jeffrey Xu Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Piao%2C+C">Chengzhi Piao</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Hong Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+D">Deli Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Rong%2C+Y">Yu Rong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05562v1-abstract-short" style="display: inline;"> Query optimization, which finds the optimized execution plan for a given query, is a complex planning and decision-making problem within the exponentially growing plan space in database management systems (DBMS). Traditional optimizers heavily rely on a certain cost model constructed by various heuristics and empirical tuning, probably leading to generating suboptimal plans. Recent developments of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05562v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05562v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05562v1-abstract-full" style="display: none;"> Query optimization, which finds the optimized execution plan for a given query, is a complex planning and decision-making problem within the exponentially growing plan space in database management systems (DBMS). Traditional optimizers heavily rely on a certain cost model constructed by various heuristics and empirical tuning, probably leading to generating suboptimal plans. Recent developments of Large Language Models (LLMs) have demonstrated their potential in solving complex planning and decision-making problems, such as arithmetic and programmatic tasks. In this paper, we try to explore the potential of LLMs in handling query optimization and propose a tentative LLM-based query optimizer dubbed LLM-QO, established on PostgreSQL&#39;s execution engine. In LLM-QO, we formulate query optimization in an autoregressive fashion which directly generates the execution plan without explicit plan enumeration. To investigate the essential input of LLM-QO, we design a customized data recipe named QInstruct to collect the training data from various optimizers and serialize the database&#39;s meta data, queries and corresponding plans into a textual format. Based on QInstruct, we implement a two-stage fine-tuning pipeline, Query Instruction Tuning (QIT) and Query Direct Preference Optimization (QDPO), to empower the capability of general-purpose LLMs in handling query optimization. In our experiments, LLM-QO can generate valid and high-quality plans and consistently outperforms both traditional and learned optimizers on three query workloads. Our findings verify that LLMs can be derived as query optimizers where generalization, efficiency and adaptivity deserve further research efforts. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05562v1-abstract-full').style.display = 'none'; document.getElementById('2502.05562v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16813">arXiv:2501.16813</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.16813">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Multimodal Magic Elevating Depression Detection with a Fusion of Text and Audio Intelligence </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Gan%2C+L">Lindy Gan</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Y">Yifan Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Gao%2C+X">Xiaoyang Gao</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+J">Jiaming Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+F">Fujun Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+T">Tao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16813v2-abstract-short" style="display: inline;"> This study proposes an innovative multimodal fusion model based on a teacher-student architecture to enhance the accuracy of depression classification. Our designed model addresses the limitations of traditional methods in feature fusion and modality weight allocation by introducing multi-head attention mechanisms and weighted multimodal transfer learning. Leveraging the DAIC-WOZ dataset, the stud&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16813v2-abstract-full').style.display = 'inline'; document.getElementById('2501.16813v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16813v2-abstract-full" style="display: none;"> This study proposes an innovative multimodal fusion model based on a teacher-student architecture to enhance the accuracy of depression classification. Our designed model addresses the limitations of traditional methods in feature fusion and modality weight allocation by introducing multi-head attention mechanisms and weighted multimodal transfer learning. Leveraging the DAIC-WOZ dataset, the student fusion model, guided by textual and auditory teacher models, achieves significant improvements in classification accuracy. Ablation experiments demonstrate that the proposed model attains an F1 score of 99. 
3. arXiv:2501.16813 [pdf] cs.CL cs.SD eess.AS
Multimodal Magic Elevating Depression Detection with a Fusion of Text and Audio Intelligence
Authors: Lindy Gan, Yifan Huang, Xiaoyang Gao, Jiaming Tan, Fujun Zhao, Tao Yang
Abstract: This study proposes an innovative multimodal fusion model based on a teacher-student architecture to enhance the accuracy of depression classification. Our designed model addresses the limitations of traditional methods in feature fusion and modality weight allocation by introducing multi-head attention mechanisms and weighted multimodal transfer learning. Leveraging the DAIC-WOZ dataset, the student fusion model, guided by textual and auditory teacher models, achieves significant improvements in classification accuracy. Ablation experiments demonstrate that the proposed model attains an F1 score of 99.1% on the test set, significantly outperforming unimodal and conventional approaches. Our method effectively captures the complementarity between textual and audio features while dynamically adjusting the contributions of the teacher models to enhance generalization capabilities. The experimental results highlight the robustness and adaptability of the proposed framework in handling complex multimodal data. This research provides a novel technical framework for multimodal large model learning in depression analysis, offering new insights into addressing the limitations of existing methods in modality fusion and feature extraction.
Submitted 31 January, 2025; v1 submitted 28 January, 2025; originally announced January 2025.
Comments: 21 pages, 7 figures, 1 table
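For readers skimming this entry, a minimal sketch of what attention-based text/audio fusion with learned modality weights can look like follows. The dimensions, gating scheme, and classifier head are illustrative assumptions, not the architecture from the paper.

```python
# Minimal sketch of attention-based fusion of text and audio embeddings,
# in the spirit of multi-head attention plus weighted modality mixing.
# All sizes and the learned gate are illustrative assumptions.
import torch
import torch.nn as nn

class FusionHead(nn.Module):
    def __init__(self, dim: int = 256, heads: int = 4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.gate = nn.Linear(2 * dim, 2)      # learned modality weights
        self.classifier = nn.Linear(dim, 2)    # depressed / not depressed

    def forward(self, text: torch.Tensor, audio: torch.Tensor) -> torch.Tensor:
        # text, audio: (batch, seq, dim) sequences, e.g. from teacher encoders
        fused, _ = self.attn(text, audio, audio)    # text attends to audio
        t, a = text.mean(dim=1), fused.mean(dim=1)  # pool to (batch, dim)
        w = torch.softmax(self.gate(torch.cat([t, a], dim=-1)), dim=-1)
        z = w[:, :1] * t + w[:, 1:] * a             # weighted modality mix
        return self.classifier(z)

logits = FusionHead()(torch.randn(2, 10, 256), torch.randn(2, 30, 256))
```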
href="/search/cs?searchtype=author&amp;query=Zhang%2C+O">Oliver Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Mazeika%2C+M">Mantas Mazeika</a>, <a href="/search/cs?searchtype=author&amp;query=Nguyen%2C+T">Tung Nguyen</a>, <a href="/search/cs?searchtype=author&amp;query=Anderson%2C+D">Daron Anderson</a>, <a href="/search/cs?searchtype=author&amp;query=Shah%2C+I+A">Imad Ali Shah</a>, <a href="/search/cs?searchtype=author&amp;query=Doroshenko%2C+M">Mikhail Doroshenko</a>, <a href="/search/cs?searchtype=author&amp;query=Stokes%2C+A+C">Alun Cennyth Stokes</a>, <a href="/search/cs?searchtype=author&amp;query=Mahmood%2C+M">Mobeen Mahmood</a> , et al. (710 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.14249v3-abstract-short" style="display: inline;"> Benchmarks are important tools for tracking the rapid advancements in large language model (LLM) capabilities. However, benchmarks are not keeping pace in difficulty: LLMs now achieve over 90\% accuracy on popular benchmarks like MMLU, limiting informed measurement of state-of-the-art LLM capabilities. In response, we introduce Humanity&#39;s Last Exam (HLE), a multi-modal benchmark at the frontier of&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14249v3-abstract-full').style.display = 'inline'; document.getElementById('2501.14249v3-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.14249v3-abstract-full" style="display: none;"> Benchmarks are important tools for tracking the rapid advancements in large language model (LLM) capabilities. However, benchmarks are not keeping pace in difficulty: LLMs now achieve over 90\% accuracy on popular benchmarks like MMLU, limiting informed measurement of state-of-the-art LLM capabilities. In response, we introduce Humanity&#39;s Last Exam (HLE), a multi-modal benchmark at the frontier of human knowledge, designed to be the final closed-ended academic benchmark of its kind with broad subject coverage. HLE consists of 3,000 questions across dozens of subjects, including mathematics, humanities, and the natural sciences. HLE is developed globally by subject-matter experts and consists of multiple-choice and short-answer questions suitable for automated grading. Each question has a known solution that is unambiguous and easily verifiable, but cannot be quickly answered via internet retrieval. State-of-the-art LLMs demonstrate low accuracy and calibration on HLE, highlighting a significant gap between current LLM capabilities and the expert human frontier on closed-ended academic questions. To inform research and policymaking upon a clear understanding of model capabilities, we publicly release HLE at https://lastexam.ai. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.14249v3-abstract-full').style.display = 'none'; document.getElementById('2501.14249v3-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">26 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.08497">arXiv:2501.08497</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.08497">pdf</a>, <a href="https://arxiv.org/format/2501.08497">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Addressing Intersectionality, Explainability, and Ethics in AI-Driven Diagnostics: A Rebuttal and Call for Transdiciplinary Action </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+M+J+T">Myles Joshua Toledo Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Benos%2C+P+V">Panayiotis V. Benos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.08497v1-abstract-short" style="display: inline;"> The increasing integration of artificial intelligence (AI) into medical diagnostics necessitates a critical examination of its ethical and practical implications. While the prioritization of diagnostic accuracy, as advocated by Sabuncu et al. (2025), is essential, this approach risks oversimplifying complex socio-ethical issues, including fairness, privacy, and intersectionality. This rebuttal emp&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08497v1-abstract-full').style.display = 'inline'; document.getElementById('2501.08497v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.08497v1-abstract-full" style="display: none;"> The increasing integration of artificial intelligence (AI) into medical diagnostics necessitates a critical examination of its ethical and practical implications. While the prioritization of diagnostic accuracy, as advocated by Sabuncu et al. (2025), is essential, this approach risks oversimplifying complex socio-ethical issues, including fairness, privacy, and intersectionality. This rebuttal emphasizes the dangers of reducing multifaceted health disparities to quantifiable metrics and advocates for a more transdisciplinary approach. By incorporating insights from social sciences, ethics, and public health, AI systems can address the compounded effects of intersecting identities and safeguard sensitive data. Additionally, explainability and interpretability must be central to AI design, fostering trust and accountability. This paper calls for a framework that balances accuracy with fairness, privacy, and inclusivity to ensure AI-driven diagnostics serve diverse populations equitably and ethically. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.08497v1-abstract-full').style.display = 'none'; document.getElementById('2501.08497v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 1 figure; working paper</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07166">arXiv:2501.07166</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.07166">pdf</a>, <a href="https://arxiv.org/format/2501.07166">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3627673.3679529">10.1145/3627673.3679529 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Natural Language-Assisted Multi-modal Medication Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+J">Jie Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Rong%2C+Y">Yu Rong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+K">Kangfei Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Bian%2C+T">Tian Bian</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+T">Tingyang Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Junzhou Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Cheng%2C+H">Hong Cheng</a>, <a href="/search/cs?searchtype=author&amp;query=Meng%2C+H">Helen Meng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.07166v1-abstract-short" style="display: inline;"> Combinatorial medication recommendation(CMR) is a fundamental task of healthcare, which offers opportunities for clinical physicians to provide more precise prescriptions for patients with intricate health conditions, particularly in the scenarios of long-term medical care. Previous research efforts have sought to extract meaningful information from electronic health records (EHRs) to facilitate c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.07166v1-abstract-full').style.display = 'inline'; document.getElementById('2501.07166v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.07166v1-abstract-full" style="display: none;"> Combinatorial medication recommendation(CMR) is a fundamental task of healthcare, which offers opportunities for clinical physicians to provide more precise prescriptions for patients with intricate health conditions, particularly in the scenarios of long-term medical care. Previous research efforts have sought to extract meaningful information from electronic health records (EHRs) to facilitate combinatorial medication recommendations. Existing learning-based approaches further consider the chemical structures of medications, but ignore the textual medication descriptions in which the functionalities are clearly described. Furthermore, the textual knowledge derived from the EHRs of patients remains largely underutilized. 
To address these issues, we introduce the Natural Language-Assisted Multi-modal Medication Recommendation (NLA-MMR), a multi-modal alignment framework designed to learn knowledge from the patient view and medication view jointly. Specifically, NLA-MMR formulates CMR as an alignment problem from patient and medication modalities. In this vein, we employ pretrained language models (PLMs) to extract in-domain knowledge regarding patients and medications, serving as the foundational representation for both modalities. In the medication modality, we exploit both chemical structures and textual descriptions to create medication representations. In the patient modality, we generate the patient representations based on textual descriptions of diagnosis, procedure, and symptom. Extensive experiments conducted on three publicly accessible datasets demonstrate that NLA-MMR achieves new state-of-the-art performance, with a notable average improvement of 4.72% in Jaccard score. Our source code is publicly available on https://github.com/jtan1102/NLA-MMR_CIKM_2024.
Submitted 13 January, 2025; originally announced January 2025.
Comments: 10 pages
Journal ref: Proceedings of the 33rd ACM International Conference on Information and Knowledge Management, Boise, ID, USA, 2024

7. arXiv:2501.06807 [pdf, other] cs.CR
MPCache: MPC-Friendly KV Cache Eviction for Efficient Private Large Language Model Inference
Authors: Wenxuan Zeng, Ye Dong, Jinjin Zhou, Junming Ma, Jin Tan, Runsheng Wang, Meng Li
Abstract: Private large language model (LLM) inference based on secure multi-party computation (MPC) offers cryptographically-secure protection for both user prompt and proprietary model weights.
However, it suffers from large latency overhead especially for long input sequences. While key-value (KV) cache eviction algorithms have been proposed to reduce the computation and memory cost for plaintext inference, they are not designed for MPC and cannot benefit private inference easily. In this paper, we propose an accurate and MPC-friendly KV cache eviction framework, dubbed MPCache. MPCache is built on the observation that historical tokens in a long sequence may have different effects on the downstream decoding. Hence, MPCache combines a look-once static eviction algorithm to discard unimportant tokens and a query-aware dynamic selection algorithm to further select a small subset of tokens for attention computation. As existing dynamic selection algorithms incur too much latency, we propose a series of optimizations to drastically reduce the KV cache selection overhead, including MPC-friendly similarity approximation, hierarchical KV cache clustering, and cross-layer index sharing strategy. With extensive experiments, we demonstrate that MPCache consistently outperforms prior-art KV cache eviction baselines across different LLM generation tasks and achieves 1.8~2.01x and 3.39~8.37x decoding latency and communication reduction on different sequence lengths, respectively.
Submitted 12 January, 2025; originally announced January 2025.
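The static-then-dynamic idea in this abstract can be illustrated with a toy, plaintext-only sketch: evict once by an importance score, then pick a small top-k subset per decoding step. The scoring choices below are assumptions for illustration and carry none of MPCache's MPC-friendly machinery.

```python
# Toy two-stage KV-cache reduction: (1) static eviction done once,
# (2) query-aware top-k selection per decoding step. Illustrative only.
import torch

def static_evict(keys, values, importance, keep_ratio=0.5):
    """Discard the least-important cached tokens once, before decoding."""
    k = max(1, int(keys.shape[0] * keep_ratio))
    idx = importance.topk(k).indices.sort().values   # keep original order
    return keys[idx], values[idx]

def dynamic_select(query, keys, top_k=8):
    """Pick the top-k cached tokens most similar to the current query."""
    scores = keys @ query                            # (num_tokens,)
    return scores.topk(min(top_k, keys.shape[0])).indices

keys, values = torch.randn(128, 64), torch.randn(128, 64)
importance = torch.rand(128)               # e.g. accumulated attention mass
keys, values = static_evict(keys, values, importance)
active = dynamic_select(torch.randn(64), keys)   # indices used for attention
```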
8. arXiv:2501.04389 [pdf, other] cs.IT
Evidence-based multimodal fusion on structured EHRs and free-text notes for ICU outcome prediction
Authors: Yucheng Ruan, Daniel J. Tan, See Kiong Ng, Ling Huang, Mengling Feng
Abstract: Objective: Accurate Intensive Care Unit (ICU) outcome prediction is critical for improving patient treatment quality and ICU resource allocation. Existing research mainly focuses on structured data and lacks effective frameworks to integrate clinical notes from heterogeneous electronic health records (EHRs). This study aims to explore a multimodal framework based on evidence theory that can effectively combine heterogeneous structured EHRs and free-text notes for accurate and reliable ICU outcome prediction. Materials and Methods: We proposed an evidence-based multimodal fusion framework to predict ICU outcomes, including mortality and prolonged length of stay (PLOS), by utilizing both structured EHR data and free-text notes from the MIMIC-III database. We compare the performance against baseline models that use only structured EHRs, free-text notes, or existing multimodal approaches. Results: The results demonstrate that the evidence-based multimodal fusion model achieved both accurate and reliable prediction. Specifically, it outperformed the best baseline by 1.05%/1.02% in BACC, 9.74%/6.04% in F1 score, 1.28%/0.9% in AUROC, and 6.21%/2.68% in AUPRC for predicting mortality and PLOS, respectively. Additionally, it improved the reliability of the predictions with a 26.8%/15.1% reduction in the Brier score and a 25.0%/13.3% reduction in negative log-likelihood. Conclusion: This study demonstrates that the evidence-based multimodal fusion framework can serve as a strong baseline for predictions using structured EHRs and free-text notes. It effectively reduces false positives, which can help improve the allocation of medical resources in the ICU. This framework can be further applied to analyze multimodal EHRs for other clinical tasks.
Submitted 8 January, 2025; originally announced January 2025.
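As a pointer for the "evidence theory" phrasing in this entry: combining two modalities' beliefs is classically done with Dempster's rule of combination. The sketch below shows the rule over a binary frame with an explicit ignorance mass; the numbers and the mapping from model outputs to masses are hypothetical, and the paper's fusion framework is more involved.

```python
# Dempster's rule over a binary frame {event, no-event} plus ignorance.
# Mass values and their derivation from model outputs are hypothetical.

def dempster(m1: dict[str, float], m2: dict[str, float]) -> dict[str, float]:
    """Combine two mass functions with keys 'e' (event), 'n' (no event),
    and 'u' (uncertain, i.e. the whole frame)."""
    # Conflict: one source supports the event, the other its negation.
    conflict = m1["e"] * m2["n"] + m1["n"] * m2["e"]
    norm = 1.0 - conflict
    return {
        "e": (m1["e"] * m2["e"] + m1["e"] * m2["u"] + m1["u"] * m2["e"]) / norm,
        "n": (m1["n"] * m2["n"] + m1["n"] * m2["u"] + m1["u"] * m2["n"]) / norm,
        "u": (m1["u"] * m2["u"]) / norm,
    }

structured = {"e": 0.6, "n": 0.2, "u": 0.2}  # from a structured-EHR model
notes = {"e": 0.5, "n": 0.1, "u": 0.4}       # from a free-text-notes model
print(dempster(structured, notes))           # fused belief about the outcome
```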
9. arXiv:2501.02809 [pdf, other] cs.RO
Theoretical Data-Driven MobilePosenet: Lightweight Neural Network for Accurate Calibration-Free 5-DOF Magnet Localization
Authors: Wenxuan Xie, Yuelin Zhang, Jiwei Shan, Hongzhe Sun, Jiewen Tan, Shing Shin Cheng
Abstract: Permanent magnet tracking using the external sensor array is crucial for the accurate localization of wireless capsule endoscope robots. Traditional tracking algorithms, based on the magnetic dipole model and Levenberg-Marquardt (LM) algorithm, face challenges related to computational delays and the need for initial position estimation. More recently proposed neural network-based approaches often require extensive hardware calibration and real-world data collection, which are time-consuming and labor-intensive. To address these challenges, we propose MobilePosenet, a lightweight neural network architecture that leverages depthwise separable convolutions to minimize computational cost and a channel attention mechanism to enhance localization accuracy. Besides, the inputs to the network integrate the sensors' coordinate information and random noise, compensating for the discrepancies between the theoretical model and the actual magnetic fields and thus allowing MobilePosenet to be trained entirely on theoretical data. Experimental evaluations conducted in a 90 × 90 × 80 mm workspace demonstrate that MobilePosenet exhibits excellent 5-DOF localization accuracy (1.54 ± 1.03 mm and 2.24 ± 1.84°) and inference speed (0.9 ms) against state-of-the-art methods trained on real-world data.
Since network training relies solely on theoretical data, MobilePosenet can eliminate the hardware calibration and real-world data collection process, improving the generalizability of this permanent magnet localization method and the potential for rapid adoption in different clinical settings.
Submitted 6 January, 2025; originally announced January 2025.
Comments: 9 pages, 5 figures
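The two building blocks this abstract names, depthwise separable convolution and channel attention, are standard and easy to sketch. The layer sizes below are illustrative guesses, not the MobilePosenet architecture.

```python
# Sketch of a depthwise separable convolution (depthwise + pointwise) and a
# squeeze-and-excite style channel attention gate. Sizes are illustrative.
import torch
import torch.nn as nn

class DepthwiseSeparable(nn.Module):
    def __init__(self, c_in: int, c_out: int):
        super().__init__()
        self.depthwise = nn.Conv2d(c_in, c_in, 3, padding=1, groups=c_in)
        self.pointwise = nn.Conv2d(c_in, c_out, 1)  # mixes channels cheaply

    def forward(self, x):
        return self.pointwise(self.depthwise(x))

class ChannelAttention(nn.Module):
    def __init__(self, c: int, r: int = 4):
        super().__init__()
        self.fc = nn.Sequential(nn.Linear(c, c // r), nn.ReLU(),
                                nn.Linear(c // r, c), nn.Sigmoid())

    def forward(self, x):
        w = self.fc(x.mean(dim=(2, 3)))        # squeeze spatial dims H, W
        return x * w[:, :, None, None]         # re-weight channels

x = torch.randn(1, 16, 9, 9)                   # e.g. a sensor-array grid
y = ChannelAttention(32)(DepthwiseSeparable(16, 32)(x))
```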
10. arXiv:2501.02256 [pdf, other] cs.NI
Covering Underwater Shadow Zones using Acoustic Reconfigurable Intelligent Surfaces
Authors: Longfei Zhao, Jingbo Tan, Jintao Wang, Ian F. Akyildiz, Zhi Sun
Abstract: To better explore the oceans, seamless communication coverage of the vast 3D underwater space is desired. Unlike terrestrial networks using radio signals, underwater acoustic communications face a unique challenge: nodes in underwater shadow zones cannot connect to the network, even within the line of sight. These shadow zones can extend for tens of kilometers, causing communication nodes to disconnect. Existing efforts focus on passive avoidance of shadow zones, but this strategy cannot ensure seamless coverage in dynamic ocean environments. This paper addresses the shadow zone problem by utilizing acoustic Reconfigurable Intelligent Surfaces (RIS) to actively control the underwater channel. Shadow zones are analytically modeled, and optimal RIS deployment strategies are developed for both deep-sea and shallow-sea environments. The acoustic RIS is redesigned considering practical engineering limitations and validated through pool tests. Bellhop-based simulations show that without RIS deployment, coverage is limited to less than 20%, regardless of source strength. However, with optimal RIS deployment, energy coverage can reach almost 100%.
Submitted 4 January, 2025; originally announced January 2025.

11. arXiv:2501.00106 [pdf, other] cs.SE cs.AI
LicenseGPT: A Fine-tuned Foundation Model for Publicly Available Dataset License Compliance
Authors: Jingwen Tan, Gopi Krishnan Rajbahadur, Zi Li, Xiangfu Song, Jianshan Lin, Dan Li, Zibin Zheng, Ahmed E. Hassan
Abstract: Dataset license compliance is a critical yet complex aspect of developing commercial AI products, particularly with the increasing use of publicly available datasets. Ambiguities in dataset licenses pose significant legal risks, making it challenging even for software IP lawyers to accurately interpret rights and obligations.
arXiv:2501.00106 [pdf, other] cs.SE (Software Engineering); cs.AI (Artificial Intelligence)
LicenseGPT: A Fine-tuned Foundation Model for Publicly Available Dataset License Compliance
Authors: Jingwen Tan, Gopi Krishnan Rajbahadur, Zi Li, Xiangfu Song, Jianshan Lin, Dan Li, Zibin Zheng, Ahmed E. Hassan
Abstract: Dataset license compliance is a critical yet complex aspect of developing commercial AI products, particularly with the increasing use of publicly available datasets. Ambiguities in dataset licenses pose significant legal risks, making it challenging even for software IP lawyers to accurately interpret rights and obligations. In this paper, we introduce LicenseGPT, a fine-tuned foundation model (FM) specifically designed for dataset license compliance analysis. We first evaluate existing legal FMs (i.e., FMs specialized in understanding and processing legal texts) and find that the best-performing model achieves a Prediction Agreement (PA) of only 43.75%. LicenseGPT, fine-tuned on a curated dataset of 500 licenses annotated by legal experts, significantly improves PA to 64.30%, outperforming both legal and general-purpose FMs. Through an A/B test and user study with software IP lawyers, we demonstrate that LicenseGPT reduces analysis time by 94.44%, from 108 seconds to 6 seconds per license, without compromising accuracy. Software IP lawyers perceive LicenseGPT as a valuable supplementary tool that enhances efficiency while acknowledging the need for human oversight in complex cases. Our work underscores the potential of specialized AI tools in legal practice and offers a publicly available resource for practitioners and researchers.
Submitted 30 December, 2024; originally announced January 2025.
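The reported time saving is internally consistent with the quoted per-license times:

\[
\frac{108\,\mathrm{s} - 6\,\mathrm{s}}{108\,\mathrm{s}} = \frac{102}{108} \approx 0.9444 = 94.44\%.
\]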
arXiv:2412.20430 [pdf, other] eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
Unlocking adaptive digital pathology through dynamic feature learning
Authors: Jiawen Li, Tian Guan, Qingxin Xia, Yizhi Wang, Xitong Ling, Jing Li, Qiang Huang, Zihan Wang, Zhiyuan Shen, Yifei Ma, Zimo Zhao, Zhe Lei, Tiandong Chen, Junbo Tan, Xueqian Wang, Xiu-Wu Bian, Zhe Wang, Lingchuan Guo, Chao He, Yonghong He
Abstract: Foundation models have revolutionized the paradigm of digital pathology, as they leverage general-purpose features to emulate real-world pathological practices, enabling the quantitative analysis of critical histological patterns and the dissection of cancer-specific signals.
However, these static general features constrain the flexibility and pathological relevance in the ever-evolving needs of clinical applications, hindering the broad use of current models. Here we introduce PathFiT, a dynamic feature learning method that can be effortlessly plugged into various pathology foundation models to unlock their adaptability. Moreover, PathFiT can be implemented seamlessly across diverse pathology applications regardless of downstream specificity. To validate PathFiT, we construct a digital pathology benchmark with over 20 terabytes of Internet and real-world data comprising 28 H&E-stained tasks and 7 specialized imaging tasks including Masson's Trichrome staining and immunofluorescence images. By applying PathFiT to the representative pathology foundation models, we demonstrate state-of-the-art performance on 34 out of 35 tasks, with significant improvements on 23 tasks and a 10.20% gain on specialized imaging tasks. The superior performance and versatility of PathFiT open up new avenues in computational pathology.
Submitted 29 December, 2024; originally announced December 2024.
Comments: 49 pages, 14 figures

arXiv:2412.20368 [pdf, other] cs.RO (Robotics)
Subconscious Robotic Imitation Learning
Authors: Jun Xie, Zhicheng Wang, Jianwei Tan, Huanxu Lin, Xiaoguang Ma
Abstract: Although robotic imitation learning (RIL) is promising for embodied intelligent robots, existing RIL approaches rely on computationally intensive multi-model trajectory predictions, resulting in slow execution and limited real-time responsiveness.
Instead, the human subconscious can constantly process and store vast amounts of information from experience, perception, and learning, allowing humans to fulfill complex actions such as riding a bike without consciously thinking about each step. Inspired by this phenomenon in action neurology, we introduce subconscious robotic imitation learning (SRIL), wherein cognitive offloading is combined with historical action chunking to reduce delays caused by model inference, thereby accelerating task execution. This process is further enhanced by a subconscious downsampling and pattern-augmented learning policy, wherein intent-rich information is addressed with quantized sampling techniques to improve manipulation efficiency. Experimental results demonstrate that execution speeds of SRIL are 100% to 200% faster than SOTA policies for comprehensive dual-arm tasks, with consistently higher success rates.
Submitted 29 December, 2024; originally announced December 2024.
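The latency mechanism this abstract attributes to action chunking (executing cached chunks of actions instead of querying the model every step) can be sketched compactly. A hedged illustration, assuming a hypothetical policy object with a predict_chunk method; this is not the authors' implementation.

    from collections import deque

    def run_episode(policy, env, steps=200, chunk_size=8):
        """Execute a whole predicted action chunk before re-querying the policy.

        With chunk size H, the number of (slow) model inferences drops from
        `steps` to roughly `steps / H`, which is the speed-up mechanism that
        historical action chunking exploits.
        """
        obs = env.reset()
        buffer = deque()          # cached, not-yet-executed actions
        inferences = 0
        for _ in range(steps):
            if not buffer:        # only call the model when the cache is empty
                buffer.extend(policy.predict_chunk(obs, horizon=chunk_size))
                inferences += 1
            obs = env.step(buffer.popleft())
        return inferences         # ~ steps / chunk_size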
arXiv:2412.17735 [pdf, other] math.CO (Combinatorics); cs.DM (Discrete Mathematics)
Colouring t-perfect graphs
Authors: Maria Chudnovsky, Linda Cook, James Davies, Sang-il Oum, Jane Tan
Abstract: Perfect graphs can be described as the graphs whose stable set polytopes are defined by their non-negativity and clique inequalities (including edge inequalities). In 1975, Chvátal defined an analogous class of t-perfect graphs, which are the graphs whose stable set polytopes are defined by their non-negativity, edge inequalities, and odd circuit inequalities. We show that t-perfect graphs are $199053$-colourable. This is the first finite bound on the chromatic number of t-perfect graphs and answers a question of Shepherd from 1995. Our proof also shows that every h-perfect graph with clique number $\omega$ is $(\omega + 199050)$-colourable.
Submitted 23 December, 2024; originally announced December 2024.
Comments: 23 pages, 4 figures
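For reference, the inequality systems named in this abstract can be written out explicitly. For a graph $G=(V,E)$, the stable set polytope of a t-perfect graph consists of the points $x \in \mathbb{R}^V$ satisfying

\[
\begin{aligned}
x_v &\ge 0 && \text{for all } v \in V &&\text{(non-negativity)}\\
x_u + x_v &\le 1 && \text{for all } uv \in E &&\text{(edge inequalities)}\\
\sum_{v \in C} x_v &\le \tfrac{|C| - 1}{2} && \text{for all odd circuits } C &&\text{(odd circuit inequalities)}
\end{aligned}
\]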
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">23 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.17238">arXiv:2412.17238</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.17238">pdf</a>, <a href="https://arxiv.org/format/2412.17238">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> </div> <p class="title is-5 mathjax"> Modality-Aware Shot Relating and Comparing for Video Scene Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tan%2C+J">Jiawei Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hongxing Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Dang%2C+K">Kang Dang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jiaxin Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ou%2C+Z">Zhilong Ou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.17238v1-abstract-short" style="display: inline;"> Video scene detection involves assessing whether each shot and its surroundings belong to the same scene. Achieving this requires meticulously correlating multi-modal cues, $\it{e.g.}$ visual entity and place modalities, among shots and comparing semantic changes around each shot. However, most methods treat multi-modal semantics equally and do not examine contextual differences between the two si&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.17238v1-abstract-full').style.display = 'inline'; document.getElementById('2412.17238v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.17238v1-abstract-full" style="display: none;"> Video scene detection involves assessing whether each shot and its surroundings belong to the same scene. Achieving this requires meticulously correlating multi-modal cues, $\it{e.g.}$ visual entity and place modalities, among shots and comparing semantic changes around each shot. However, most methods treat multi-modal semantics equally and do not examine contextual differences between the two sides of a shot, leading to sub-optimal detection performance. In this paper, we propose the $\bf{M}$odality-$\bf{A}$ware $\bf{S}$hot $\bf{R}$elating and $\bf{C}$omparing approach (MASRC), which enables relating shots per their own characteristics of visual entity and place modalities, as well as comparing multi-shots similarities to have scene changes explicitly encoded. Specifically, to fully harness the potential of visual entity and place modalities in modeling shot relations, we mine long-term shot correlations from entity semantics while simultaneously revealing short-term shot correlations from place semantics. In this way, we can learn distinctive shot features that consolidate coherence within scenes and amplify distinguishability across scenes. 
Once equipped with distinctive shot features, we further encode the relations between the preceding and succeeding shots of each target shot by similarity convolution, aiding in the identification of scene-ending shots. We validate the broad applicability of the proposed components in MASRC. Extensive experimental results on public benchmark datasets demonstrate that the proposed MASRC significantly advances video scene detection.
Submitted 22 December, 2024; originally announced December 2024.
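The core comparison step (contrasting the shots before a candidate boundary with those after it) has a simple cosine-similarity reading. A hedged sketch under assumed precomputed per-shot feature vectors; this is not the paper's similarity convolution, just the intuition behind it.

    import numpy as np

    def boundary_score(feats, t, window=3):
        """Score shot t as a scene ending by comparing its two sides.

        feats: (num_shots, dim) array of shot features. Low average similarity
        between the `window` shots ending at t and the `window` shots after t
        suggests a scene change at t.
        """
        left = feats[max(0, t - window + 1): t + 1]
        right = feats[t + 1: t + 1 + window]
        left = left / np.linalg.norm(left, axis=1, keepdims=True)
        right = right / np.linalg.norm(right, axis=1, keepdims=True)
        cross_sim = left @ right.T          # pairwise cosine similarities
        return 1.0 - cross_sim.mean()       # higher score = likelier boundary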
arXiv:2412.14197 [pdf, other] cs.CV (Computer Vision and Pattern Recognition); cs.LG (Machine Learning)
Advancing Vehicle Plate Recognition: Multitasking Visual Language Models with VehiclePaliGemma
Authors: Nouar AlDahoul, Myles Joshua Toledo Tan, Raghava Reddy Tera, Hezerul Abdul Karim, Chee How Lim, Manish Kumar Mishra, Yasir Zaki
Abstract: License plate recognition (LPR) involves automated systems that utilize cameras and computer vision to read vehicle license plates. Such plates collected through LPR can then be compared against databases to identify stolen vehicles, uninsured drivers, crime suspects, and more. The LPR system plays a significant role in saving time for institutions such as the police force. In the past, LPR relied heavily on Optical Character Recognition (OCR), which has been widely explored to recognize characters in images. Usually, collected plate images suffer from various limitations, including noise, blurring, weather conditions, and close characters, making recognition complex. Existing LPR methods still require significant improvement, especially for distorted images. To fill this gap, we propose utilizing visual language models (VLMs) such as OpenAI GPT-4o, Google Gemini 1.5, Google PaliGemma (Pathways Language and Image model + Gemma model), Meta Llama 3.2, Anthropic Claude 3.5 Sonnet, LLaVA, NVIDIA VILA, and moondream2 to recognize such unclear plates with close characters. This paper evaluates the VLMs' capability to address the aforementioned problems. Additionally, we introduce "VehiclePaliGemma", a fine-tuned open-source PaliGemma VLM designed to recognize plates under challenging conditions. We compared our proposed VehiclePaliGemma with state-of-the-art methods and other VLMs using a dataset of Malaysian license plates collected under complex conditions. The results indicate that VehiclePaliGemma achieved superior performance with an accuracy of 87.6%. Moreover, it is able to predict a car's plate at a speed of 7 frames per second using an A100-80GB GPU. Finally, we explored the multitasking capability of the VehiclePaliGemma model to accurately identify plates containing multiple cars of various models and colors, with plates positioned and oriented in different directions.
Submitted 14 December, 2024; originally announced December 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">33 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2412.13018">arXiv:2412.13018</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2412.13018">pdf</a>, <a href="https://arxiv.org/format/2412.13018">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OmniEval: An Omnidirectional and Automatic RAG Evaluation Benchmark in Financial Domain </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Shuting Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Tan%2C+J">Jiejun Tan</a>, <a href="/search/cs?searchtype=author&amp;query=Dou%2C+Z">Zhicheng Dou</a>, <a href="/search/cs?searchtype=author&amp;query=Wen%2C+J">Ji-Rong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2412.13018v1-abstract-short" style="display: inline;"> As a typical and practical application of Large Language Models (LLMs), Retrieval-Augmented Generation (RAG) techniques have gained extensive attention, particularly in vertical domains where LLMs may lack domain-specific knowledge. In this paper, we introduce an omnidirectional and automatic RAG benchmark, OmniEval, in the financial domain. Our benchmark is characterized by its multi-dimensional&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2412.13018v1-abstract-full').style.display = 'inline'; document.getElementById('2412.13018v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2412.13018v1-abstract-full" style="display: none;"> As a typical and practical application of Large Language Models (LLMs), Retrieval-Augmented Generation (RAG) techniques have gained extensive attention, particularly in vertical domains where LLMs may lack domain-specific knowledge. In this paper, we introduce an omnidirectional and automatic RAG benchmark, OmniEval, in the financial domain. Our benchmark is characterized by its multi-dimensional evaluation framework, including (1) a matrix-based RAG scenario evaluation system that categorizes queries into five task classes and 16 financial topics, leading to a structured assessment of diverse query scenarios; (2) a multi-dimensional evaluation data generation approach, which combines GPT-4-based automatic generation and human annotation, achieving an 87.47\% acceptance ratio in human evaluations on generated instances; (3) a multi-stage evaluation system that evaluates both retrieval and generation performance, result in a comprehensive evaluation on the RAG pipeline; and (4) robust evaluation metrics derived from rule-based and LLM-based ones, enhancing the reliability of assessments through manual annotations and supervised fine-tuning of an LLM evaluator. 
Our experiments demonstrate the comprehensiveness of OmniEval, which includes extensive test datasets and highlights the performance variations of RAG systems across diverse topics and tasks, revealing significant opportunities for RAG models to improve their capabilities in vertical domains. We open source the code of our benchmark at https://github.com/RUC-NLPIR/OmniEval.
Submitted 17 December, 2024; originally announced December 2024.
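The "matrix-based" scenario system in point (1) is essentially a task-class by topic grid. A minimal sketch of evaluating over such a grid; the class and topic labels and the retrieval-system interface below are placeholders, not OmniEval's actual taxonomy (which has five classes and 16 topics).

    from itertools import product

    # Placeholder labels; the real matrix is 5 task classes x 16 topics.
    TASK_CLASSES = ["extractive_qa", "multi_hop_qa", "comparison", "long_form_qa", "conversational_qa"]
    TOPICS = ["stocks", "funds", "insurance", "futures"]  # ... 16 in the benchmark

    def evaluate_matrix(rag_system, datasets, metric):
        """Score a RAG system on every (task class, topic) cell of the grid.

        datasets: dict mapping (task, topic) -> list of (question, reference).
        """
        scores = {}
        for task, topic in product(TASK_CLASSES, TOPICS):
            examples = datasets.get((task, topic), [])
            if examples:
                scores[(task, topic)] = sum(
                    metric(rag_system(q), ref) for q, ref in examples
                ) / len(examples)
        return scores  # per-cell averages expose weak task/topic combinations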
arXiv:2412.12083 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
IDArb: Intrinsic Decomposition for Arbitrary Number of Input Views and Illuminations
Authors: Zhibing Li, Tong Wu, Jing Tan, Mengchen Zhang, Jiaqi Wang, Dahua Lin
Abstract: Capturing geometric and material information from images remains a fundamental challenge in computer vision and graphics. Traditional optimization-based methods often require hours of computational time to reconstruct geometry, material properties, and environmental lighting from dense multi-view inputs, while still struggling with inherent ambiguities between lighting and material. On the other hand, learning-based approaches leverage rich material priors from existing 3D object datasets but face challenges with maintaining multi-view consistency. In this paper, we introduce IDArb, a diffusion-based model designed to perform intrinsic decomposition on an arbitrary number of images under varying illuminations. Our method achieves accurate and multi-view consistent estimation of surface normals and material properties. This is made possible through a novel cross-view, cross-domain attention module and an illumination-augmented, view-adaptive training strategy. Additionally, we introduce ARB-Objaverse, a new dataset that provides large-scale multi-view intrinsic data and renderings under diverse lighting conditions, supporting robust training. Extensive experiments demonstrate that IDArb outperforms state-of-the-art methods both qualitatively and quantitatively. Moreover, our approach facilitates a range of downstream tasks, including single-image relighting, photometric stereo, and 3D reconstruction, highlighting its broad applications in realistic 3D content creation.
Submitted 16 December, 2024; originally announced December 2024.
arXiv:2412.11939 [pdf, other] cs.AI (Artificial Intelligence); cs.CL (Computation and Language)
SEAGraph: Unveiling the Whole Story of Paper Review Comments
Authors: Jianxiang Yu, Jiaqi Tan, Zichen Ding, Jiapeng Zhu, Jiahao Li, Yao Cheng, Qier Cui, Yunshi Lan, Xiang Li
Abstract: Peer review, as a cornerstone of scientific research, ensures the integrity and quality of scholarly work by providing authors with objective feedback for refinement. However, in the traditional peer review process, authors often receive vague or insufficiently detailed feedback, which provides limited assistance and leads to a more time-consuming review cycle. If authors can identify some specific weaknesses in their paper, they can not only address the reviewer's concerns but also improve their work. This raises the critical question of how to enhance authors' comprehension of review comments. In this paper, we present SEAGraph, a novel framework developed to clarify review comments by uncovering the underlying intentions behind them. We construct two types of graphs for each paper: the semantic mind graph, which captures the author's thought process, and the hierarchical background graph, which delineates the research domains related to the paper. A retrieval method is then designed to extract relevant content from both graphs, facilitating coherent explanations for the review comments. Extensive experiments show that SEAGraph excels in review comment understanding tasks, offering significant benefits to authors.
Submitted 16 December, 2024; originally announced December 2024.
arXiv:2412.11869 [pdf, other] cs.SE (Software Engineering)
Automated Detection of Inter-Language Design Smells in Multi-Language Deep Learning Frameworks
Authors: Zengyang Li, Xiaoyong Zhang, Wenshuo Wang, Peng Liang, Ran Mo, Jie Tan, Hui Liu
Abstract: Nowadays, most deep learning frameworks (DLFs) use multilingual programming in Python and C/C++, facilitating the flexibility and performance of a DLF. However, inappropriate inter-language interaction may introduce design smells involving multiple programming languages (PLs), i.e., Inter-Language Design Smells (ILDS). Despite the negative impact of ILDS on multi-language DLFs, there is a lack of automated approaches for detecting ILDS in multi-language DLFs and of a comprehensive understanding of ILDS in such DLFs. This work automatically detects ILDS in multi-language DLFs written in the combination of Python and C/C++, and aims to obtain an understanding of such ILDS in DLFs. We first developed an approach to automatically detect ILDS in multi-language DLFs written in the combination of Python and C/C++, including a number of ILDS and their detection rules defined based on inter-language communication mechanisms and code analysis. We then developed the CPSMELL tool that implements the detection rules for automatically detecting such ILDS, and manually validated the accuracy of the tool. Finally, we performed a study to evaluate the ILDS in multi-language DLFs. We proposed seven ILDS and achieved an accuracy of 98.17% in the manual validation of CPSMELL on 5 popular multi-language DLFs.
The study results revealed that among the 5 DLFs, TensorFlow, PyTorch, and PaddlePaddle exhibit a relatively high prevalence of ILDS; each smelly file contains around 5 ILDS instances on average, with the ILDS Long Lambda Function for Inter-language Binding and Unused Native Entity being relatively prominent; and throughout the evolution of the 5 DLFs, some ILDS were resolved to a certain extent, but the overall count of ILDS instances shows an upward trend. The automated detection of the proposed ILDS achieved high accuracy, and the study provides a comprehensive understanding of ILDS in multi-language DLFs.
Submitted 16 December, 2024; originally announced December 2024.
Comments: Preprint accepted for publication in Information and Software Technology, 2024
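To make the notion of a detection rule concrete, here is a hedged sketch of what a rule for the "Long Lambda Function for Inter-language Binding" smell could look like; the line threshold and the pybind11-style pattern are illustrative assumptions, not CPSMELL's actual rules.

    import re

    # Hypothetical rule: a lambda passed to a pybind11-style `.def(...)` binding
    # whose body spans more than N lines is flagged as an inter-language smell.
    MAX_LAMBDA_LINES = 5
    BINDING_LAMBDA = re.compile(r'\.def\([^,]+,\s*\[\]\([^)]*\)\s*\{', re.DOTALL)

    def find_long_binding_lambdas(cpp_source: str):
        """Yield (line_number, body_lines) for oversized binding lambdas."""
        for match in BINDING_LAMBDA.finditer(cpp_source):
            start = match.end() - 1            # position of the opening '{'
            depth, pos = 0, start
            while pos < len(cpp_source):       # find the matching closing brace
                depth += {'{': 1, '}': -1}.get(cpp_source[pos], 0)
                pos += 1
                if depth == 0:
                    break
            body_lines = cpp_source[start:pos].count('\n') + 1
            if body_lines > MAX_LAMBDA_LINES:
                yield cpp_source[:match.start()].count('\n') + 1, body_lines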
arXiv:2412.07548 [pdf, other] cs.DB (Databases)
Automatic Database Configuration Debugging using Retrieval-Augmented Language Models
Authors: Sibei Chen, Ju Fan, Bin Wu, Nan Tang, Chao Deng, Pengyi Wang, Ye Li, Jian Tan, Feifei Li, Jingren Zhou, Xiaoyong Du
Abstract: Database management system (DBMS) configuration debugging, e.g., diagnosing poorly configured DBMS knobs and generating troubleshooting recommendations, is crucial in optimizing DBMS performance. However, the configuration debugging process is tedious and sometimes challenging, even for seasoned database administrators (DBAs) with sufficient experience in DBMS configurations and a good understanding of DBMS internals (e.g., MySQL or Oracle). To address this difficulty, we propose Andromeda, a framework that utilizes large language models (LLMs) to enable automatic DBMS configuration debugging. Andromeda serves as a natural surrogate for DBAs, answering a wide range of natural language (NL) questions on DBMS configuration issues and generating diagnostic suggestions to fix these issues. Nevertheless, directly prompting LLMs with these professional questions may result in overly generic and often unsatisfying answers. To this end, we propose a retrieval-augmented generation (RAG) strategy that effectively provides matched domain-specific contexts for each question from multiple sources: related historical questions, troubleshooting manuals, and DBMS telemetries, which significantly improves the performance of configuration debugging. To support the RAG strategy, we develop a document retrieval mechanism addressing heterogeneous documents and design an effective method for telemetry analysis. Extensive experiments on real-world DBMS configuration debugging datasets show that Andromeda significantly outperforms existing solutions.
Submitted 17 January, 2025; v1 submitted 10 December, 2024; originally announced December 2024.
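The multi-source retrieval idea generalizes well. A minimal sketch of assembling a configuration-debugging prompt from the three source types the abstract names (historical questions, manuals, telemetry); the retriever interface and function names are placeholders, not Andromeda's API.

    def build_debug_prompt(question, qa_index, manual_index, telemetry_summary, k=3):
        """Assemble an LLM prompt from the three context sources named above.

        qa_index / manual_index: any retriever exposing .search(text, k) that
        returns the k most similar snippets (placeholder interface).
        telemetry_summary: pre-analyzed DBMS metrics relevant to the question.
        """
        context_blocks = (
            ["## Similar past questions"] + qa_index.search(question, k)
            + ["## Troubleshooting manual excerpts"] + manual_index.search(question, k)
            + ["## Telemetry analysis", telemetry_summary]
        )
        return "\n".join(context_blocks + [
            "## Question", question,
            "Diagnose the misconfigured knobs and suggest fixes.",
        ])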
arXiv:2412.05479 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
TACO: Learning Multi-modal Action Models with Synthetic Chains-of-Thought-and-Action
Authors: Zixian Ma, Jianguo Zhang, Zhiwei Liu, Jieyu Zhang, Juntao Tan, Manli Shu, Juan Carlos Niebles, Shelby Heinecke, Huan Wang, Caiming Xiong, Ranjay Krishna, Silvio Savarese
Abstract: While open-source multi-modal language models perform well on simple question answering tasks, they often fail on complex questions that require multiple capabilities, such as fine-grained recognition, visual grounding, and reasoning, and that demand multi-step solutions. We present TACO, a family of multi-modal large action models designed to improve performance on such complex, multi-step, and multi-modal tasks. During inference, TACO produces chains-of-thought-and-action (CoTA), executes intermediate steps by invoking external tools such as OCR, depth estimation, and a calculator, then integrates both the thoughts and action outputs to produce coherent responses. To train TACO, we create a large dataset of over 1M synthetic CoTA traces generated with GPT-4o and Python programs. We then experiment with various data filtering and mixing techniques and obtain a final subset of 293K high-quality CoTA examples.
This dataset enables TACO to learn complex reasoning and action paths, surpassing existing models trained on instruction-tuning data with only direct answers. Our model TACO outperforms the instruction-tuned baseline across 8 benchmarks, achieving a 3.6% improvement on average, with gains of up to 15% on MMVet tasks involving OCR, mathematical reasoning, and spatial reasoning. Training on high-quality CoTA traces sets a new standard for complex multi-modal reasoning, highlighting the need for structured, multi-step instruction tuning in advancing open-source multi-modal models' capabilities.
Submitted 10 December, 2024; v1 submitted 6 December, 2024; originally announced December 2024.
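The inference pattern described above (alternate between emitting a thought, invoking a tool, and folding the tool output back into context) is a standard tool-use loop. A hedged sketch with placeholder model and tool interfaces; TACO's actual action format is not specified in the abstract.

    # Placeholder tool registry; the abstract names OCR, depth estimation,
    # and a calculator among TACO's external tools.
    TOOLS = {
        "ocr": lambda image: "...recognized text...",
        "calculator": lambda expression: str(eval(expression)),  # toy only
    }

    def cota_inference(model, question, image, max_steps=8):
        """Run a chain-of-thought-and-action loop until the model answers."""
        context = [("question", question)]
        for _ in range(max_steps):
            step = model.next_step(context, image)   # assumed interface
            if step.kind == "answer":
                return step.text                     # final coherent response
            # step.kind == "action": execute the named tool, record its output
            observation = TOOLS[step.tool](**step.args)
            context += [("thought", step.thought), ("observation", observation)]
        return model.finalize(context)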
arXiv:2412.04434 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Towards Real-Time Open-Vocabulary Video Instance Segmentation
Authors: Bin Yan, Martin Sundermeyer, David Joseph Tan, Huchuan Lu, Federico Tombari
Abstract: In this paper, we address the challenge of performing open-vocabulary video instance segmentation (OV-VIS) in real time. We analyze the computational bottlenecks of state-of-the-art foundation models that perform OV-VIS, and propose a new method, TROY-VIS, that significantly improves processing speed while maintaining high accuracy. We introduce three key techniques: (1) Decoupled Attention Feature Enhancer to speed up information interaction between different modalities and scales; (2) Flash Embedding Memory for obtaining fast text embeddings of object categories; and (3) Kernel Interpolation for exploiting the temporal continuity in videos. Our experiments demonstrate that TROY-VIS achieves the best trade-off between accuracy and speed on two large-scale OV-VIS benchmarks, BURST and LV-VIS, running 20x faster than GLEE-Lite (25 FPS vs. 1.25 FPS) with comparable or even better accuracy. These results demonstrate TROY-VIS's potential for real-time applications in dynamic environments such as mobile robotics and augmented reality. Code and model will be released at https://github.com/google-research/troyvis.
Submitted 5 December, 2024; originally announced December 2024.
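Technique (3) rests on the observation that instance kernels change slowly across adjacent frames. A hedged sketch of the general idea (predict kernels only on keyframes and linearly interpolate in between); this is an assumption about the mechanism, not TROY-VIS's published implementation.

    import numpy as np

    def interpolate_kernels(kernels_a, kernels_b, num_between):
        """Linearly interpolate instance kernels between two keyframes.

        kernels_a, kernels_b: (num_instances, dim) kernel matrices predicted
        at keyframes t and t + num_between + 1. Frames in between reuse
        interpolated kernels instead of running the heavy prediction head.
        """
        alphas = np.linspace(0.0, 1.0, num_between + 2)[1:-1]  # exclude endpoints
        return [(1 - a) * kernels_a + a * kernels_b for a in alphas]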
arXiv:2412.03552 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Imagine360: Immersive 360 Video Generation from Perspective Anchor
Authors: Jing Tan, Shuai Yang, Tong Wu, Jingwen He, Yuwei Guo, Ziwei Liu, Dahua Lin
Abstract: $360^\circ$ videos offer a hyper-immersive experience that allows viewers to explore a dynamic scene from full 360 degrees. To achieve more user-friendly and personalized content creation in the $360^\circ$ video format, we seek to lift standard perspective videos into $360^\circ$ equirectangular videos. To this end, we introduce Imagine360, the first perspective-to-$360^\circ$ video generation framework that creates high-quality $360^\circ$ videos with rich and diverse motion patterns from video anchors. Imagine360 learns fine-grained spherical visual and motion patterns from limited $360^\circ$ video data through several key designs. 1) First, we adopt a dual-branch design, including a perspective and a panorama video denoising branch, to provide local and global constraints for $360^\circ$ video generation, with the motion module and spatial LoRA layers fine-tuned on extended web $360^\circ$ videos. 2) Additionally, an antipodal mask is devised to capture long-range motion dependencies, enhancing the reversed camera motion between antipodal pixels across hemispheres. 3) To handle diverse perspective video inputs, we propose elevation-aware designs that adapt to varying video masking due to changing elevations across frames. Extensive experiments show that Imagine360 achieves superior graphics quality and motion coherence among state-of-the-art $360^\circ$ video generation methods. We believe Imagine360 holds promise for advancing personalized, immersive $360^\circ$ video creation.
Submitted 4 December, 2024; originally announced December 2024.
Comments: Project page: https://ys-imtech.github.io/projects/Imagine360
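The antipodal relationship in design 2) has a closed form in equirectangular coordinates, stated here for clarity (a standard identity, not taken from the paper). Writing a pixel's spherical position as latitude-longitude $(\theta, \phi)$, its antipode is

\[
(\theta, \phi) \;\longmapsto\; \bigl(-\theta,\; (\phi + \pi) \bmod 2\pi\bigr),
\]

which for an $H \times W$ equirectangular frame corresponds to the pixel map

\[
(i, j) \;\longmapsto\; \bigl(H - 1 - i,\; (j + W/2) \bmod W\bigr).
\]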
arXiv:2412.03409  [pdf, other]  cs.CV
PrefixKV: Adaptive Prefix KV Cache is What Vision Instruction-Following Models Need for Efficient Generation
Authors: Ao Wang, Hui Chen, Jianchao Tan, Kefeng Zhang, Xunliang Cai, Zijia Lin, Jungong Han, Guiguang Ding
Abstract: Recently, large vision-language models (LVLMs) have rapidly gained popularity for their strong generation and reasoning capabilities given diverse multimodal inputs. However, these models incur significant computational and memory overhead during inference, which greatly hinders efficient deployment in practical scenarios. The extensive key-value (KV) cache, necessitated by the lengthy input and output sequences, notably contributes to the high inference cost. Based on this, recent works have investigated ways to reduce the KV cache size for higher efficiency. Although effective, they generally overlook the distinct importance distributions of KV vectors across layers and maintain the same cache size for each layer during next-token prediction. This results in significant contextual information loss for certain layers, leading to notable performance decline. To address this, we present PrefixKV.
It reframes the challenge of determining KV cache sizes for all layers into the task of searching for the optimal global prefix configuration. With an adaptive layer-wise KV retention recipe based on binary search, the maximum contextual information can thus be preserved in each layer, facilitating generation. Extensive experiments demonstrate that our method achieves state-of-the-art performance compared with others. It exhibits a superior trade-off between inference efficiency and generation quality, showing promising potential for practical applications. Code is available at https://github.com/THU-MIG/PrefixKV.

Submitted 7 December, 2024; v1 submitted 4 December, 2024; originally announced December 2024.
Comments: 12 pages, 5 figures
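The abstract sketches the mechanism at a high level: score KV entries per layer by importance, then binary-search a single global configuration so that each layer keeps its own prefix size under a shared budget. A hedged toy version of that search, assuming per-layer importance scores are given (function and variable names are ours, not from the PrefixKV code):

```python
import numpy as np


def layer_budgets(importance_per_layer, total_budget):
    """Toy sketch of PrefixKV-style allocation (not the authors' code):
    binary-search one global threshold on per-layer importance scores so the
    summed number of retained KV entries fits the total cache budget, letting
    each layer keep a different prefix size."""
    lo, hi = 0.0, max(float(s.max()) for s in importance_per_layer)
    for _ in range(50):                      # fixed-iteration binary search
        mid = (lo + hi) / 2
        kept = sum(int((s >= mid).sum()) for s in importance_per_layer)
        if kept > total_budget:              # keeping too much -> raise threshold
            lo = mid
        else:
            hi = mid
    return [int((s >= hi).sum()) for s in importance_per_layer]


rng = np.random.default_rng(0)
scores = [rng.random(100) ** (layer + 1) for layer in range(4)]  # skew varies by layer
print(layer_budgets(scores, total_budget=120))  # flatter layers retain more entries
```

Because the threshold is global while the score distributions are layer-specific, layers with flatter importance distributions naturally retain more entries, which is the adaptive layer-wise behavior the abstract describes.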
arXiv:2411.17123  [pdf, other]  cs.CV cs.AI
Advancing Content Moderation: Evaluating Large Language Models for Detecting Sensitive Content Across Text, Images, and Videos
Authors: Nouar AlDahoul, Myles Joshua Toledo Tan, Harishwar Reddy Kasireddy, Yasir Zaki
Abstract: The widespread dissemination of hate speech, harassment, harmful and sexual content, and violence across websites and media platforms presents substantial challenges and provokes widespread concern among different sectors of society. Governments, educators, and parents are often at odds with media platforms about how to regulate, control, and limit the spread of such content. Technologies for detecting and censoring media content are a key solution to addressing these challenges. Techniques from natural language processing and computer vision have been used widely to automatically identify and filter out sensitive content such as offensive language, violence, nudity, and addiction in text, images, and videos, enabling platforms to enforce content policies at scale. However, existing methods still have limitations in achieving high detection accuracy with fewer false positives and false negatives. Therefore, more sophisticated algorithms for understanding the context of both text and images may open room for improvement in content censorship and enable more efficient censorship systems. In this paper, we evaluate existing LLM-based content moderation solutions such as the OpenAI moderation model and Llama-Guard3 and study their capabilities to detect sensitive content. Additionally, we explore recent LLMs such as GPT, Gemini, and Llama in identifying inappropriate content across media outlets. Various textual and visual datasets, including X tweets, Amazon reviews, news articles, human photos, cartoons, sketches, and violence videos, have been utilized for evaluation and comparison. The results demonstrate that LLMs outperform traditional techniques by achieving higher accuracy and lower false positive and false negative rates. This highlights the potential to integrate LLMs into websites, social media platforms, and video-sharing services for regulatory and content moderation purposes.

Submitted 26 November, 2024; originally announced November 2024.
Comments: 55 pages, 16 figures

arXiv:2411.17052  [pdf, other]  cs.RO
Dynamic Programming-Based Offline Redundancy Resolution of Redundant Manipulators Along Prescribed Paths with Real-Time Adjustment
Authors: Zhihang Yin, Fa Wu, Ziqian Wang, Jianmin Yang, Jiyong Tan, Dexing Kong
Abstract: Traditional offline redundancy resolution of trajectories for redundant manipulators involves computing inverse kinematic solutions for Cartesian space paths, constraining the manipulator to a fixed path without real-time adjustments. Online redundancy resolution can achieve real-time adjustment of paths, but it cannot consider subsequent path points, leading to the possibility of the manipulator being forced to stop mid-motion due to joint constraints. To address this, this paper introduces a dynamic programming-based offline redundancy resolution method for redundant manipulators along prescribed paths with real-time adjustment. The proposed method allows the manipulator to move along a prescribed path while implementing real-time adjustment along the normal to the path. Using dynamic programming, the proposed approach computes a global maximum for the variation of adjustment coefficients. As long as the coefficient variation between adjacent sampling path points does not exceed this limit, the algorithm provides the next path point's joint angles based on the current joint angles, enabling the end-effector to achieve the adjusted Cartesian pose.
The main innovation of this paper lies in augmenting traditional offline optimal planning with real-time adjustment capabilities, achieving a fusion of offline and online planning.

Submitted 25 November, 2024; originally announced November 2024.

arXiv:2411.17034  [pdf, other]  cs.RO
Dynamic Programming-Based Redundancy Resolution for Path Planning of Redundant Manipulators Considering Breakpoints
Authors: Zhihang Yin, Fa Wu, Ruofan Bian, Ziqian Wang, Jianmin Yang, Jiyong Tan, Dexing Kong
Abstract: This paper proposes a redundancy resolution algorithm for a redundant manipulator based on dynamic programming. This algorithm can compute the desired joint angles at each point on a pre-planned discrete path in Cartesian space, while ensuring that the angles, velocities, and accelerations of each joint do not exceed the manipulator's constraints. We obtain the analytical solution to the inverse kinematics problem of the manipulator using a parameterization method, transforming the redundancy resolution problem into an optimization problem of determining the parameters at each path point. The constraints on joint velocity and acceleration serve as constraints for the optimization problem.
All feasible inverse kinematic solutions for each pose under the manipulator's joint-angle constraints are then obtained through the parameterization method, and the globally optimal solution is found by the dynamic programming algorithm. On the other hand, if a feasible joint-space path satisfying the constraints does not exist, the proposed algorithm can compute the minimum number of breakpoints required for the path and partition the path with as few breakpoints as possible to facilitate the manipulator's operation along it. The algorithm can also determine the optimal selection of breakpoints to minimize the global cost function, rather than simply interrupting when the manipulator is unable to continue operating. The proposed algorithm is tested on a manipulator produced by a certain manufacturer, demonstrating its effectiveness.

Submitted 25 November, 2024; originally announced November 2024.
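The two preceding entries reduce redundancy resolution to the same skeleton: each path point offers a discrete set of feasible parameter values, transitions between adjacent points are constrained, and dynamic programming picks the globally optimal sequence. A generic Viterbi-style sketch of that skeleton under our own simplifying assumptions (scalar parameters and a squared-difference cost), not the authors' implementation:

```python
import numpy as np


def dp_select(candidates, step_cost, max_step):
    """Choose one candidate value per path point so adjacent choices differ
    by at most `max_step` and the summed transition cost is globally minimal.
    Illustrative reduction of the papers' DP formulation, not their code."""
    n = len(candidates)
    cost, back = [np.zeros(len(candidates[0]))], []
    for t in range(1, n):
        prev, cur = np.asarray(candidates[t - 1]), np.asarray(candidates[t])
        trans = step_cost(prev[:, None], cur[None, :])          # pairwise costs
        trans[np.abs(prev[:, None] - cur[None, :]) > max_step] = np.inf  # infeasible
        total = cost[-1][:, None] + trans
        back.append(total.argmin(axis=0))                       # best predecessor
        cost.append(total.min(axis=0))
    idx = [int(cost[-1].argmin())]                              # backtrack
    for t in range(n - 2, -1, -1):
        idx.append(int(back[t][idx[-1]]))
    idx.reverse()
    return [candidates[t][i] for t, i in enumerate(idx)]


pts = [np.linspace(-1, 1, 21) for _ in range(5)]   # feasible parameters per point
path = dp_select(pts, lambda a, b: (a - b) ** 2, max_step=0.3)
print(np.round(path, 2))
```

In the papers, the candidates are feasible inverse-kinematics parameters per pose and the feasibility test encodes joint velocity and acceleration limits; the breakpoint variant additionally handles the case where no feasible sequence exists.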
arXiv:2411.15707  [pdf, other]  cs.CR cs.AI
Nimbus: Secure and Efficient Two-Party Inference for Transformers
Authors: Zhengyi Li, Kang Yang, Jin Tan, Wen-jie Lu, Haoqi Wu, Xiao Wang, Yu Yu, Derun Zhao, Yancheng Zheng, Minyi Guo, Jingwen Leng
Abstract: Transformer models have gained significant attention due to their power in machine learning tasks. Their extensive deployment has raised concerns about the potential leakage of sensitive information during inference. However, when applied to Transformers, existing approaches based on secure two-party computation (2PC) face efficiency limitations on two fronts: (1) resource-intensive matrix multiplications in linear layers, and (2) complex non-linear activation functions like $\mathsf{GELU}$ and $\mathsf{Softmax}$. This work presents Nimbus, a new two-party inference framework for Transformer models. For the linear layer, we propose a new 2PC paradigm along with an encoding approach to securely compute matrix multiplications based on an outer-product insight, which achieves $2.9\times \sim 12.5\times$ performance improvements compared to the state-of-the-art (SOTA) protocol. For the non-linear layer, through a new observation of utilizing the input distribution, we propose an approach of low-degree polynomial approximation for $\mathsf{GELU}$ and $\mathsf{Softmax}$, which improves the performance of the SOTA polynomial approximation by $2.9\times \sim 4.0\times$, with an average accuracy loss of 0.08% compared to non-2PC inference without privacy. Compared with the SOTA two-party inference, Nimbus improves the end-to-end performance of BERT inference by $2.7\times \sim 4.7\times$ across different network settings.

Submitted 23 November, 2024; originally announced November 2024.
Comments: Accepted by NIPS 2024
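Nimbus's non-linear-layer idea, as the abstract states it, is to fit low-degree polynomials while exploiting the input distribution. A hedged illustration of that principle only, fitting a degree-4 polynomial to GELU on samples drawn from an assumed activation distribution rather than uniformly over an interval (the distribution parameters below are invented for the demo, and the secure-computation protocol is not modeled):

```python
import numpy as np
from scipy.special import erf  # exact GELU reference only


def gelu(x):
    return 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))


rng = np.random.default_rng(0)
samples = rng.normal(0.0, 1.5, 20_000)           # assumed activation distribution
coeffs = np.polyfit(samples, gelu(samples), deg=4)   # least squares = distribution-weighted

test = rng.normal(0.0, 1.5, 10_000)
err = np.abs(np.polyval(coeffs, test) - gelu(test))
print(f"degree-4 poly, mean |error| on in-distribution inputs: {err.mean():.4f}")
```

Fitting on distribution samples spends approximation error where inputs are likely, which is why a very low degree can suffice in practice.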
arXiv:2411.13547  [pdf, other]  cs.SE cs.AI
SpecTool: A Benchmark for Characterizing Errors in Tool-Use LLMs
Authors: Shirley Kokane, Ming Zhu, Tulika Awalgaonkar, Jianguo Zhang, Thai Hoang, Akshara Prabhakar, Zuxin Liu, Tian Lan, Liangwei Yang, Juntao Tan, Rithesh Murthy, Weiran Yao, Zhiwei Liu, Juan Carlos Niebles, Huan Wang, Shelby Heinecke, Caiming Xiong, Silvio Savarese
Abstract: Evaluating the output of Large Language Models (LLMs) is one of the most critical aspects of building a performant compound AI system.
Since the output from LLMs propagates to downstream steps, identifying LLM errors is crucial to system performance. A common task for LLMs in AI systems is tool use. While there are several benchmark environments for evaluating LLMs on this task, they typically only give a success rate without any explanation of the failure cases. To solve this problem, we introduce SpecTool, a new benchmark to identify error patterns in LLM output on tool-use tasks. Our benchmark dataset comprises queries from diverse environments that can be used to test for the presence of seven newly characterized error patterns. Using SpecTool, we show that even the most prominent LLMs exhibit these error patterns in their outputs. Researchers can use the analysis and insights from SpecTool to guide their error mitigation strategies.

Submitted 20 November, 2024; originally announced November 2024.

arXiv:2411.07104  [pdf, other]  cs.RO cs.AI cs.LG cs.MA
Learning Multi-Agent Loco-Manipulation for Long-Horizon Quadrupedal Pushing
Authors: Yuming Feng, Chuye Hong, Yaru Niu, Shiqi Liu, Yuxiang Yang, Wenhao Yu, Tingnan Zhang, Jie Tan, Ding Zhao
Abstract: Recently, quadrupedal locomotion has achieved significant success, but quadrupeds' manipulation capabilities, particularly in handling large objects, remain limited, restricting their usefulness in demanding real-world applications such as search and rescue, construction, industrial automation, and room organization.
This paper tackles the task of obstacle-aware, long-horizon pushing by multiple quadrupedal robots. We propose a hierarchical multi-agent reinforcement learning framework with three levels of control. The high-level controller integrates an RRT planner and a centralized adaptive policy to generate subgoals, while the mid-level controller uses a decentralized goal-conditioned policy to guide the robots toward these subgoals. A pre-trained low-level locomotion policy executes the movement commands. We evaluate our method against several baselines in simulation, demonstrating significant improvements over baseline approaches, with 36.0% higher success rates and a 24.5% reduction in completion time compared to the best baseline. Our framework successfully enables long-horizon, obstacle-aware manipulation tasks like Push-Cuboid and Push-T on Go1 robots in the real world.

Submitted 14 November, 2024; v1 submitted 11 November, 2024; originally announced November 2024.
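As a structural sketch only, the three-level hierarchy described above can be read as a simple nested control loop; every function below is a stand-in stub we invented, not the authors' policies:

```python
# Toy sketch of the three control levels (1-D positions, one object, two robots).
def high_level(object_pos):
    """Planner + centralized adaptive policy: emit the next subgoal."""
    return object_pos + 0.5                 # toy: push the object 0.5 m forward

def mid_level(robot_pos, subgoal):
    """Decentralized goal-conditioned policy: a velocity command per robot."""
    return max(min(subgoal - robot_pos, 0.2), -0.2)   # clipped step toward subgoal

def low_level(robot_pos, command):
    """Pre-trained locomotion policy: execute the command for one step."""
    return robot_pos + command

robots, object_pos = [0.0, -0.4], 1.0
for _ in range(5):                          # one high-level decision per iteration
    subgoal = high_level(object_pos)
    robots = [low_level(r, mid_level(r, subgoal)) for r in robots]
print([round(r, 2) for r in robots])        # both robots converge toward the subgoal
```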
arXiv:2411.03707  [pdf]  cs.CV cs.AI
Fine-Tuning Vision-Language Model for Automated Engineering Drawing Information Extraction
Authors: Muhammad Tayyab Khan, Lequn Chen, Ye Han Ng, Wenhe Feng, Nicholas Yew Jin Tan, Seung Ki Moon
Abstract: Geometric Dimensioning and Tolerancing (GD&T) plays a critical role in manufacturing by defining acceptable variations in part features to ensure component quality and functionality. However, extracting GD&T information from 2D engineering drawings is a time-consuming and labor-intensive task, often relying on manual efforts or semi-automated tools. To address these challenges, this study proposes an automated and computationally efficient GD&T extraction method by fine-tuning Florence-2, an open-source vision-language model (VLM). The model is trained on a dataset of 400 drawings with ground truth annotations provided by domain experts. For comparison, two state-of-the-art closed-source VLMs, GPT-4o and Claude-3.5-Sonnet, are evaluated on the same dataset. All models are assessed using precision, recall, F1-score, and hallucination metrics. Due to the computational cost and impracticality of fine-tuning large closed-source VLMs for domain-specific tasks, GPT-4o and Claude-3.5-Sonnet are evaluated in a zero-shot setting. In contrast, Florence-2, a smaller model with 0.23 billion parameters, is optimized through full-parameter fine-tuning across three distinct experiments, each utilizing datasets augmented to different levels.
The results show that Florence-2 achieves a 29.95% increase in precision, a 37.75% increase in recall, a 52.40% improvement in F1-score, and a 43.15% reduction in hallucination rate compared to the best-performing closed-source model. These findings highlight the effectiveness of fine-tuning smaller, open-source VLMs like Florence-2, offering a practical and efficient solution for automated GD&T extraction to support downstream manufacturing tasks.

Submitted 6 November, 2024; originally announced November 2024.
Comments: Paper has been submitted to the 9th International Conference on Innovation in Artificial Intelligence (ICIAI 2025)

arXiv:2411.02959  [pdf, other]  cs.IR  doi: 10.1145/3696410.3714546
HtmlRAG: HTML is Better Than Plain Text for Modeling Retrieved Knowledge in RAG Systems
Authors: Jiejun Tan, Zhicheng Dou, Wen Wang, Mang Wang, Weipeng Chen, Ji-Rong Wen
Abstract: Retrieval-Augmented Generation (RAG) has been shown to improve knowledge capabilities and alleviate the hallucination problem of LLMs. The Web is a major source of external knowledge used in RAG systems, and many commercial RAG systems have used Web search engines as their major retrieval systems.
Typically, such RAG systems retrieve search results, download the HTML sources of the results, and then extract plain text from the HTML sources. Plain text documents or chunks are fed into the LLMs to augment the generation. However, much of the structural and semantic information inherent in HTML, such as headings and table structures, is lost during this plain-text-based RAG process. To alleviate this problem, we propose HtmlRAG, which uses HTML instead of plain text as the format of retrieved knowledge in RAG. We believe HTML is better than plain text at modeling knowledge in external documents, and most LLMs possess robust capacities to understand HTML. However, utilizing HTML presents new challenges. HTML contains additional content such as tags, JavaScript, and CSS specifications, which bring extra input tokens and noise to the RAG system. To address this issue, we propose HTML cleaning, compression, and a two-step block-tree-based pruning strategy to shorten the HTML while minimizing the loss of information. Experiments on six QA datasets confirm the superiority of using HTML in RAG systems.

Submitted 7 February, 2025; v1 submitted 5 November, 2024; originally announced November 2024.
Comments: Accepted by WWW 2025 main conference. Repo: https://github.com/plageon/HtmlRAG
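Of the three steps HtmlRAG names, cleaning is the most self-explanatory: drop markup that carries no retrievable knowledge before any token budget is spent. A sketch of that step alone using BeautifulSoup (the paper's actual cleaning rules, compression, and block-tree pruning live in the linked repo and are not reproduced here):

```python
from bs4 import BeautifulSoup  # pip install beautifulsoup4


def clean_html(raw: str) -> str:
    """Sketch of the cleaning step only: remove tags with no retrievable
    knowledge and strip attributes, keeping the structural skeleton."""
    soup = BeautifulSoup(raw, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()                    # drop code/CSS noise entirely
    for tag in soup.find_all(True):
        tag.attrs = {}                     # class/style/handler attributes cost tokens too
    return str(soup)


page = '<div class="x" onclick="f()"><h1>Title</h1><script>var a;</script><p>Body</p></div>'
print(clean_html(page))   # <div><h1>Title</h1><p>Body</p></div>
```

Note how the heading/paragraph structure survives, which is exactly the information the paper argues plain-text extraction throws away.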
arXiv:2411.02810  [pdf]  cs.CE cs.IR
Leveraging Vision-Language Models for Manufacturing Feature Recognition in CAD Designs
Authors: Muhammad Tayyab Khan, Lequn Chen, Ye Han Ng, Wenhe Feng, Nicholas Yew Jin Tan, Seung Ki Moon
Abstract: Automatic feature recognition (AFR) is essential for transforming design knowledge into actionable manufacturing information. Traditional AFR methods, which rely on predefined geometric rules and large datasets, are often time-consuming and lack generalizability across various manufacturing features. To address these challenges, this study investigates vision-language models (VLMs) for automating the recognition of a wide range of manufacturing features in CAD designs without the need for extensive training datasets or predefined rules. Instead, prompt engineering techniques, such as multi-view query images, few-shot learning, sequential reasoning, and chain-of-thought, are applied to enable recognition. The approach is evaluated on a newly developed CAD dataset containing designs of varying complexity relevant to machining, additive manufacturing, sheet metal forming, molding, and casting. Five VLMs, including three closed-source models (GPT-4o, Claude-3.5-Sonnet, and Claude-3.0-Opus) and two open-source models (LLaVA and MiniCPM), are evaluated on this dataset with ground truth features labelled by experts. Key metrics include feature quantity accuracy, feature name matching accuracy, hallucination rate, and mean absolute error (MAE).
Results show that Claude-3.5-Sonnet achieves the highest feature quantity accuracy (74%) and name-matching accuracy (75%) with the lowest MAE (3.2), while GPT-4o records the lowest hallucination rate (8%). In contrast, open-source models have higher hallucination rates (>30%) and lower accuracies (<40%). This study demonstrates the potential of VLMs to automate feature recognition in CAD designs within diverse manufacturing scenarios.

Submitted 4 November, 2024; originally announced November 2024.
Comments: Paper has been submitted to The ASME Journal of Computing and Information Science in Engineering (JCISE)

arXiv:2411.02307  [pdf]  cs.CY
Can Personalized Medicine Coexist with Health Equity? Examining the Cost Barrier and Ethical Implications
Authors: Kishi Kobe Yee Francisco, Andrane Estelle Carnicer Apuhin, Myles Joshua Toledo Tan, Mickael Cavanaugh Byers, Nicholle Mae Amor Tan Maravilla, Hezerul Abdul Karim, Nouar AlDahoul
Abstract: Personalized medicine (PM) promises to transform healthcare by providing treatments tailored to individual genetic, environmental, and lifestyle factors. However, its high costs and infrastructure demands raise concerns about exacerbating health disparities, especially between high-income countries (HICs) and low- and middle-income countries (LMICs).
While HICs benefit from advanced PM applications through AI and genomics, LMICs often lack the resources necessary to adopt these innovations, leading to a widening healthcare divide. This paper explores the financial and ethical challenges of PM implementation, with a focus on ensuring equitable access. It proposes strategies for global collaboration, infrastructure development, and ethical frameworks to support LMICs in adopting PM, aiming to prevent further disparities in healthcare accessibility and outcomes.

Submitted 4 November, 2024; originally announced November 2024.
Comments: 30 pages, 1 figure

arXiv:2410.24148  [pdf, other]  cs.CV
Exploring Vision Language Models for Facial Attribute Recognition: Emotion, Race, Gender, and Age
Authors: Nouar AlDahoul, Myles Joshua Toledo Tan, Harishwar Reddy Kasireddy, Yasir Zaki
Abstract: Technologies for recognizing facial attributes like race, gender, age, and emotion have several applications, such as surveillance, advertising content, sentiment analysis, and the study of demographic trends and social behaviors.
Analyzing demographic characteristics based on images and analyzing facial expressions pose several challenges due to the complexity of human facial attributes. Traditional approaches have employed CNNs and various other deep learning techniques, trained on extensive collections of labeled images. While these methods demonstrated effective performance, there remains potential for further enhancements. In this paper, we propose to utilize vision language models (VLMs) such as generative pre-trained transformer (GPT), Gemini, large language and vision assistant (LLaVA), PaliGemma, and Microsoft Florence2 to recognize facial attributes such as race, gender, age, and emotion from images with human faces. Various datasets like FairFace, AffectNet, and UTKFace have been utilized to evaluate the solutions. The results show that VLMs are competitive with, if not superior to, traditional techniques. Additionally, we propose "FaceScanPaliGemma", a fine-tuned PaliGemma model, for race, gender, age, and emotion recognition. The results show an accuracy of 81.1%, 95.8%, 80%, and 59.4% for race, gender, age group, and emotion classification, respectively, outperforming the pre-trained version of PaliGemma, other VLMs, and SotA methods. Finally, we propose "FaceScanGPT", a GPT-4o model that recognizes the above attributes when several individuals are present in the image, using a prompt engineered for a person with specific facial and/or physical attributes. The results underscore FaceScanGPT's superior multitasking capability in detecting an individual's attributes such as haircut, clothing color, and posture, using only a prompt to drive the detection and recognition tasks.

Submitted 31 October, 2024; originally announced October 2024.
Comments: 52 pages, 13 figures

arXiv:2410.20314  [pdf, other]  cs.CV eess.IV
Wavelet-based Mamba with Fourier Adjustment for Low-light Image Enhancement
Authors: Junhao Tan, Songwen Pei, Wei Qin, Bo Fu, Ximing Li, Libo Huang
Abstract: Frequency information (e.g., from the Discrete Wavelet Transform and Fast Fourier Transform) has been widely applied to solve the issue of Low-Light Image Enhancement (LLIE). However, existing frequency-based models primarily operate in the simple wavelet or Fourier space of images, which underutilizes the valid global and local information in each space. We found that wavelet frequency information is more sensitive to global brightness due to its low-frequency component, while Fourier frequency information is more sensitive to local details due to its phase component. To achieve superior preliminary brightness enhancement by optimally integrating spatial channel information with the low-frequency components of the wavelet transform, we introduce channel-wise Mamba, which captures long-range dependencies that CNNs lack and has lower complexity than Diffusion and Transformer models. In this work, we therefore propose a novel Wavelet-based Mamba with Fourier Adjustment model called WalMaFa, consisting of a Wavelet-based Mamba Block (WMB) and a Fast Fourier Adjustment Block (FFAB).
We employ an Encoder-Latent-Decoder structure to accomplish the end-to-end transformation. Specifically, WMB is adopted in the Encoder and Decoder to enhance global brightness, while FFAB is adopted in the Latent stage to fine-tune local texture details and alleviate ambiguity. Extensive experiments demonstrate that our proposed WalMaFa achieves state-of-the-art performance with fewer computational resources and faster speed. Code is now available at: https://github.com/mcpaulgeorge/WalMaFa.

Submitted 26 October, 2024; originally announced October 2024.
Comments: 18 pages, 8 figures, ACCV2024
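The paper's motivating observation, that the wavelet low-frequency component tracks global brightness, is easy to verify directly. A small demonstration with PyWavelets (our toy check of the stated property, not WalMaFa itself):

```python
import numpy as np
import pywt  # pip install PyWavelets

# A global brightness shift moves the wavelet low-frequency (LL) band while
# leaving the detail bands essentially untouched.
rng = np.random.default_rng(0)
img = rng.random((64, 64))
LL, (LH, HL, HH) = pywt.dwt2(img, "haar")
LL2, (LH2, HL2, HH2) = pywt.dwt2(img + 0.3, "haar")  # uniformly brightened copy

print(np.abs(LL2 - LL).mean())   # ~0.6: the LL band absorbs the brightness shift
print(np.abs(LH2 - LH).max())    # ~0: detail bands are unchanged
```

This is the asymmetry WalMaFa exploits: brightness is adjusted where it lives (the low-frequency band), and texture is refined separately.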
arXiv:2410.19728 (https://arxiv.org/abs/2410.19728) [hep-th, cs.LG, hep-ph]
cymyc -- Calabi-Yau Metrics, Yukawas, and Curvature
Authors: Per Berglund, Giorgi Butbaia, Tristan Hübsch, Vishnu Jejjala, Challenger Mishra, Damián Mayorga Peña, Justin Tan
Abstract: We introduce cymyc, a high-performance Python library for numerical investigation of the geometry of a large class of string compactification manifolds and their associated moduli spaces. We develop a well-defined geometric ansatz to numerically model tensor fields of arbitrary degree on a large class of Calabi-Yau manifolds. cymyc includes a machine learning component which incorporates this ansatz to model tensor fields of interest on these spaces by finding an approximate solution to the system of partial differential equations they should satisfy.
Submitted 25 October, 2024; originally announced October 2024.
Comments: 35 pages, 12 figures
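The "model a field by minimizing the residual of the PDE it should satisfy" idea can be illustrated with a toy one-dimensional analogue. A sketch assuming only numpy; the equation, ansatz, and names here are ours for illustration and are unrelated to cymyc's actual API:

    import numpy as np

    # Toy problem: find u on [0, 1] with u'' = -pi^2 sin(pi x), u(0) = u(1) = 0.
    # Ansatz: u(x) = sum_k a_k sin(k pi x). The PDE residual is linear in a_k,
    # so the "training" step collapses to a single least-squares solve.
    x = np.linspace(0, 1, 200)[1:-1]
    K = 8
    basis_dd = np.stack(
        [-(k * np.pi) ** 2 * np.sin(k * np.pi * x) for k in range(1, K + 1)], axis=1
    )
    f = -np.pi ** 2 * np.sin(np.pi * x)
    a, *_ = np.linalg.lstsq(basis_dd, f, rcond=None)
    u = sum(a[k - 1] * np.sin(k * np.pi * x) for k in range(1, K + 1))
    print(np.max(np.abs(u - np.sin(np.pi * x))))  # ~0: residual fit recovers the field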
arXiv:2410.19697 (https://arxiv.org/abs/2410.19697) [cs.RO, cs.AI, cs.CL]
IPPON: Common Sense Guided Informative Path Planning for Object Goal Navigation
Authors: Kaixian Qu, Jie Tan, Tingnan Zhang, Fei Xia, Cesar Cadena, Marco Hutter
Abstract: Navigating efficiently to an object in an unexplored environment is a critical skill for general-purpose intelligent robots. Recent approaches to this object goal navigation problem have embraced a modular strategy, integrating classical exploration algorithms (notably frontier exploration) with a learned semantic mapping/exploration module. This paper introduces a novel informative path planning and 3D object probability mapping approach. The mapping module computes the probability of the object of interest through semantic segmentation and a Bayes filter. Additionally, it stores probabilities for common objects, which semantically guides the exploration based on common-sense priors from a large language model. The planner terminates when the current viewpoint captures enough voxels identified with high confidence as the object of interest. Although our planner follows a zero-shot approach, it achieves state-of-the-art performance as measured by the Success weighted by Path Length (SPL) and Soft SPL in the Habitat ObjectNav Challenge 2023, outperforming other works by more than 20%. Furthermore, we validate its effectiveness on real robots. Project webpage: https://ippon-paper.github.io/
Submitted 25 October, 2024; originally announced October 2024.
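The per-voxel Bayes filter mentioned above has a standard log-odds form, familiar from occupancy grids. A minimal sketch (the function names and sensor-model numbers are ours, not the paper's):

    import numpy as np

    def logit(p):
        return np.log(p / (1.0 - p))

    def update_voxel(log_odds, detected, p_hit=0.8, p_miss=0.3):
        """Fuse one semantic-segmentation observation into a voxel's log-odds."""
        return log_odds + (logit(p_hit) if detected else logit(p_miss))

    l = 0.0  # prior P(object) = 0.5
    for observed in [True, True, False, True]:
        l = update_voxel(l, observed)
    print(1.0 / (1.0 + np.exp(-l)))  # posterior P(object in voxel)
    # A planner can terminate once enough voxels exceed a confidence threshold.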
arXiv:2410.18528 (https://arxiv.org/abs/2410.18528) [cs.AI]
PRACT: Optimizing Principled Reasoning and Acting of LLM Agent
Authors: Zhiwei Liu, Weiran Yao, Jianguo Zhang, Rithesh Murthy, Liangwei Yang, Zuxin Liu, Tian Lan, Ming Zhu, Juntao Tan, Shirley Kokane, Thai Hoang, Juan Carlos Niebles, Shelby Heinecke, Huan Wang, Silvio Savarese, Caiming Xiong
Abstract: We introduce the Principled Reasoning and Acting (PRAct) framework, a novel method for learning and enforcing action principles from trajectory data. Central to our approach is the use of text gradients from a reflection and optimization engine to derive these action principles. To adapt action principles to specific task requirements, we propose a new optimization framework, Reflective Principle Optimization (RPO). After execution, RPO employs a reflector to critique current action principles and an optimizer to update them accordingly. We develop the RPO framework under two scenarios: Reward-RPO, which uses environmental rewards for reflection, and Self-RPO, which conducts self-reflection without external rewards. Additionally, two RPO methods, RPO-Traj and RPO-Batch, are introduced to adapt to different settings. Experimental results across four environments demonstrate that the PRAct agent, leveraging the RPO framework, effectively learns and applies action principles to enhance performance.
Submitted 24 October, 2024; originally announced October 2024.
Comments: Accepted to SIG CoNLL 2024
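The reflect-then-optimize loop of RPO can be sketched structurally. Everything below is a stand-in (the llm() stub in particular), shown only to make the Reward-RPO control flow concrete:

    def llm(prompt: str) -> str:
        return "principle: verify preconditions before acting"  # stub model call

    def reward_rpo_step(principles: str, trajectory: str, reward: float) -> str:
        # Reflector: critique the current principles in light of the outcome.
        critique = llm(f"Critique these principles given reward {reward}:\n"
                       f"{principles}\n{trajectory}")
        # Optimizer: rewrite the principles to address the critique.
        return llm(f"Rewrite to address the critique:\n{principles}\n{critique}")

    principles = "principle: act greedily"
    for episode in range(3):
        trajectory, reward = "obs -> act -> obs -> ...", 0.2  # from an agent rollout
        principles = reward_rpo_step(principles, trajectory, reward)
    print(principles)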
arXiv:2410.12247 (https://arxiv.org/abs/2410.12247) [cs.CL, cs.DC]
EPS-MoE: Expert Pipeline Scheduler for Cost-Efficient MoE Inference
Authors: Yulei Qian, Fengcun Li, Xiangyang Ji, Xiaoyu Zhao, Jianchao Tan, Kefeng Zhang, Xunliang Cai
Abstract: The Mixture-of-Experts (MoE) model has emerged as a prominent architecture in the field of Large Language Models (LLMs), providing a better balance between model performance and computational efficiency. However, the General Matrix Multiply (GEMM) operations and large parameters introduce challenges related to computational efficiency and communication overhead, which become throughput bottlenecks during inference. Applying a single parallelism strategy like EP, DP, or TP, or a straightforward combination of them, to MoE usually achieves sub-optimal inference throughput. This paper introduces EPS-MoE, a novel expert pipeline scheduler for MoE that surpasses the existing parallelism schemes. Our approach optimizes the computation of MoE FeedForward Network (FFN) modules by dynamically selecting the best kernel implementation of GroupGemm and DenseGemm for different loads and adaptively overlapping these computations with communication, leading to a substantial increase in throughput. Our experimental results demonstrate up to a 52.4% improvement in prefill throughput compared to existing parallel inference methods. Specifically, our method accelerated the highly optimized DeepSeekV2 model from a claimed 100K tokens per second to at least 120K tokens per second.
Submitted 3 January, 2025; v1 submitted 16 October, 2024; originally announced October 2024.
Comments: 14 pages, 11 figures
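The two mechanisms named above (load-dependent kernel choice, and overlapping FFN computation with communication) can be shown schematically. A toy with threads standing in for CUDA streams; the heuristic and all function bodies are ours, not EPS-MoE's implementation:

    import threading, time

    def group_gemm(chunk): time.sleep(0.01)   # many small per-expert GEMMs
    def dense_gemm(chunk): time.sleep(0.01)   # one large fused GEMM
    def all_to_all(chunk): time.sleep(0.01)   # expert-parallel token exchange

    def pick_kernel(tokens_per_expert):
        # Toy heuristic: the dense kernel wins once every expert is saturated.
        return dense_gemm if min(tokens_per_expert) > 64 else group_gemm

    def moe_ffn(chunks, tokens_per_expert):
        kernel, comm = pick_kernel(tokens_per_expert), None
        for chunk in chunks:          # pipeline: communicate while computing
            if comm: comm.join()
            comm = threading.Thread(target=all_to_all, args=(chunk,))
            comm.start()
            kernel(chunk)             # overlaps with the in-flight all_to_all
        comm.join()

    moe_ffn(chunks=[0, 1, 2], tokens_per_expert=[128, 96, 200])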
arXiv:2410.11719 (https://arxiv.org/abs/2410.11719) [cs.IR]
Adaptive Coordinators and Prompts on Heterogeneous Graphs for Cross-Domain Recommendations
Authors: Hengyu Zhang, Chunxu Shen, Xiangguo Sun, Jie Tan, Yu Rong, Chengzhi Piao, Hong Cheng, Lingling Yi
Abstract: In the online digital world, users frequently engage with diverse items across multiple domains (e.g., e-commerce platforms, streaming services, and social media networks), forming complex heterogeneous interaction graphs. Leveraging this multi-domain information can undoubtedly enhance the performance of recommendation systems by providing more comprehensive user insights and alleviating data sparsity in individual domains. However, integrating multi-domain knowledge for cross-domain recommendation is very hard due to inherent disparities in user behavior and item characteristics, and the risk of negative transfer, where irrelevant or conflicting information from the source domains adversely impacts the target domain's performance. To address these challenges, we offer HAGO, a novel framework with Heterogeneous Adaptive Graph coOrdinators, which dynamically integrate multi-domain graphs into a cohesive structure by adaptively adjusting the connections between coordinators and multi-domain graph nodes, thereby enhancing beneficial inter-domain interactions while mitigating negative transfer effects. Additionally, we develop a universal multi-domain graph pre-training strategy alongside HAGO to collaboratively learn high-quality node representations across domains. To effectively transfer the learned multi-domain knowledge to the target domain, we design an effective graph prompting method, which incorporates pre-trained embeddings with learnable prompts for the recommendation task. Our framework is compatible with various graph-based models and pre-training techniques, demonstrating broad applicability and effectiveness. Further experimental results show that our solutions outperform state-of-the-art methods in multi-domain recommendation scenarios and highlight their potential for real-world applications.
Submitted 15 October, 2024; originally announced October 2024.
Comments: Under review
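The coordinator idea reads as a weighted pooling over every domain's nodes, with the weights learnable so that helpful cross-domain edges strengthen and negative-transfer edges fade. A numpy toy of one coordinator; shapes and names are ours, not HAGO's:

    import numpy as np

    def softmax(z):
        e = np.exp(z - z.max())
        return e / e.sum()

    rng = np.random.default_rng(0)
    domain_nodes = {"ecommerce": rng.random((5, 8)),   # 5 nodes, 8-dim features
                    "streaming": rng.random((7, 8))}
    # One learnable connection logit per node in each domain; training would
    # adjust these to modulate coordinator-to-node connections.
    logits = {d: np.zeros(x.shape[0]) for d, x in domain_nodes.items()}

    coordinator = sum(softmax(logits[d]) @ x for d, x in domain_nodes.items())
    coordinator /= len(domain_nodes)
    print(coordinator.shape)  # (8,): a cross-domain summary node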
arXiv:2410.08058 (https://arxiv.org/abs/2410.08058) [cs.CL, cs.AI, cs.LG]
Closing the Loop: Learning to Generate Writing Feedback via Language Model Simulated Student Revisions
Authors: Inderjeet Nair, Jiaye Tan, Xiaotian Su, Anne Gere, Xu Wang, Lu Wang
Abstract: Providing feedback is widely recognized as crucial for refining students' writing skills. Recent advances in language models (LMs) have made it possible to automatically generate feedback that is actionable and well-aligned with human-specified attributes. However, it remains unclear whether the feedback generated by these models is truly effective in enhancing the quality of student revisions. Moreover, prompting LMs with a precise set of instructions to generate feedback is nontrivial due to the lack of consensus regarding the specific attributes that can lead to improved revising performance. To address these challenges, we propose PROF, which PROduces Feedback via learning from LM-simulated student revisions. PROF aims to iteratively optimize the feedback generator by directly maximizing the effectiveness of students' overall revising performance as simulated by LMs. Focusing on an economic essay assignment, we empirically test the efficacy of PROF and observe that our approach not only surpasses a variety of baseline methods in effectiveness of improving students' writing but also demonstrates enhanced pedagogical value, even though it was not explicitly trained for this aspect.
Submitted 10 October, 2024; originally announced October 2024.
Comments: Accepted to EMNLP 2024
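PROF's training signal, as described, is the downstream quality of simulated revisions rather than the feedback text itself. A skeletal sketch with stand-in functions (nothing here is the paper's code; score() in particular is a placeholder for a learned quality measure):

    def generate_feedback(generator_prompt, essay):
        return f"[feedback under: {generator_prompt}]"     # stand-in LM call

    def simulate_revision(essay, feedback):
        return essay + " (revised per " + feedback + ")"   # stand-in LM student

    def score(essay):
        return len(essay)                                  # stand-in quality scorer

    essays = ["essay A", "essay B"]
    candidates = ["be specific", "focus on argument structure"]
    best = max(candidates, key=lambda p: sum(
        score(simulate_revision(e, generate_feedback(p, e))) for e in essays))
    print(best)  # keep the generator whose feedback yields the best revisions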
arXiv:2410.07053 (https://arxiv.org/abs/2410.07053) [cs.HC, cs.CL]
Robots in the Middle: Evaluating LLMs in Dispute Resolution
Authors: Jinzhe Tan, Hannes Westermann, Nikhil Reddy Pottanigari, Jaromír Šavelka, Sébastien Meeùs, Mia Godet, Karim Benyekhlef
Abstract: Mediation is a dispute resolution method featuring a neutral third party (mediator) who intervenes to help individuals resolve their dispute. In this paper, we investigate to what extent large language models (LLMs) are able to act as mediators. We investigate whether LLMs are able to analyze dispute conversations, select suitable intervention types, and generate appropriate intervention messages. Using a novel, manually created dataset of 50 dispute scenarios, we conduct a blind evaluation comparing LLMs with human annotators across several key metrics. Overall, the LLMs showed strong performance, even outperforming our human annotators across dimensions. Specifically, in 62% of the cases, the LLMs chose intervention types that were rated as better than or equivalent to those chosen by humans. Moreover, in 84% of the cases, the intervention messages generated by the LLMs were rated as better than or equal to the intervention messages written by humans. LLMs likewise performed favourably on metrics such as impartiality, understanding, and contextualization. Our results demonstrate the potential of integrating AI in online dispute resolution (ODR) platforms.
Submitted 9 October, 2024; originally announced October 2024.
arXiv:2410.01351 (https://arxiv.org/abs/2410.01351) [cs.CY, q-bio.OT, stat.AP]
Learning and teaching biological data science in the Bioconductor community
Authors: Jenny Drnevich, Frederick J. Tan, Fabricio Almeida-Silva, Robert Castelo, Aedin C. Culhane, Sean Davis, Maria A. Doyle, Susan Holmes, Leo Lahti, Alexandru Mahmoud, Kozo Nishida, Marcel Ramos, Kevin Rue-Albrecht, David J. H. Shih, Laurent Gatto, Charlotte Soneson
Abstract: Modern biological research is increasingly data-intensive, leading to a growing demand for effective training in biological data science. In this article, we provide an overview of key resources and best practices available within the Bioconductor project, an open-source software community focused on omics data analysis. This guide serves as a valuable reference for both learners and educators in the field.
Submitted 2 October, 2024; originally announced October 2024.
Comments: 16 pages, 2 figures, 1 table, 1 supplemental table
MSC Class: 97K80; ACM Class: K.3.2
arXiv:2409.20563 (https://arxiv.org/abs/2409.20563) [cs.CV]
DressRecon: Freeform 4D Human Reconstruction from Monocular Video
Authors: Jeff Tan, Donglai Xiang, Shubham Tulsiani, Deva Ramanan, Gengshan Yang
Abstract: We present a method to reconstruct time-consistent human body models from monocular videos, focusing on extremely loose clothing or handheld object interactions. Prior work in human reconstruction is either limited to tight clothing with no object interactions, or requires calibrated multi-view captures or personalized template scans, which are costly to collect at scale. Our key insight for high-quality yet flexible reconstruction is the careful combination of generic human priors about articulated body shape (learned from large-scale training data) with video-specific articulated "bag-of-bones" deformation (fit to a single video via test-time optimization). We accomplish this by learning a neural implicit model that disentangles body versus clothing deformations as separate motion model layers. To capture subtle geometry of clothing, we leverage image-based priors such as human body pose, surface normals, and optical flow during optimization. The resulting neural fields can be extracted into time-consistent meshes, or further optimized as explicit 3D Gaussians for high-fidelity interactive rendering. On datasets with highly challenging clothing deformations and object interactions, DressRecon yields higher-fidelity 3D reconstructions than prior art. Project page: https://jefftan969.github.io/dressrecon/
Submitted 8 October, 2024; v1 submitted 30 September, 2024; originally announced September 2024.
Comments: Project page: https://jefftan969.github.io/dressrecon/
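The "separate motion model layers" above amount to composing a coarse body warp with a fine clothing warp. A tiny stand-in sketch (both deformation functions are placeholders, not DressRecon's learned fields):

    import numpy as np

    def body_deform(points, t):       # coarse articulated body motion
        return points + np.array([0.1 * t, 0.0, 0.0])

    def clothing_deform(points, t):   # fine "bag-of-bones" clothing offsets
        return points + 0.01 * np.sin(t) * points

    def warp(points, t):
        # Layered composition: body first, clothing on top.
        return clothing_deform(body_deform(points, t), t)

    print(warp(np.zeros((4, 3)), t=1.0))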
arXiv:2409.16081 (https://arxiv.org/abs/2409.16081) [cs.HC, cs.AI]
Online Multi-level Contrastive Representation Distillation for Cross-Subject fNIRS Emotion Recognition
Authors: Zhili Lai, Chunmei Qing, Junpeng Tan, Wanxiang Luo, Xiangmin Xu
Abstract: Utilizing functional near-infrared spectroscopy (fNIRS) signals for emotion recognition is a significant advancement in understanding human emotions. However, due to the lack of artificial-intelligence data and algorithms in this field, current research faces the following challenges: 1) portable wearable devices impose stricter requirements on model lightweightness; 2) objective physiological and psychological differences among subjects aggravate the difficulty of emotion recognition. To address these challenges, we propose a novel cross-subject fNIRS emotion recognition method, called the Online Multi-level Contrastive Representation Distillation framework (OMCRD). Specifically, OMCRD is a framework designed for mutual learning among multiple lightweight student networks. It utilizes a multi-level fNIRS feature extractor for each sub-network and conducts multi-view sentiment mining using physiological signals. The proposed Inter-Subject Interaction Contrastive Representation (IS-ICR) facilitates knowledge transfer for interactions between student models, enhancing cross-subject emotion recognition performance. The optimal student network can be selected and deployed on a wearable device. Experimental results demonstrate that OMCRD achieves state-of-the-art results in emotional perception and affective imagery tasks.
Submitted 24 September, 2024; originally announced September 2024.
Comments: Accepted in ACMMM-2024 Workshop BCI. Codes are available at https://github.com/Lzhili/fNIRS-OMCRD
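A contrastive objective between two students' embeddings of the same sample has a standard InfoNCE form, which conveys the flavor of the inter-student transfer above. A numpy sketch (our simplification, not the IS-ICR definition):

    import numpy as np

    def info_nce(za, zb, tau=0.1):
        za = za / np.linalg.norm(za, axis=1, keepdims=True)
        zb = zb / np.linalg.norm(zb, axis=1, keepdims=True)
        logits = za @ zb.T / tau                  # (N, N) pairwise similarities
        logits -= logits.max(axis=1, keepdims=True)
        log_p = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
        return -np.mean(np.diag(log_p))           # matching pairs are positives

    rng = np.random.default_rng(0)
    z_a = rng.normal(size=(16, 32))               # student A's embeddings
    z_b = z_a + 0.1 * rng.normal(size=(16, 32))   # student B, same fNIRS windows
    print(info_nce(z_a, z_b))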
arXiv:2409.15574 (https://arxiv.org/abs/2409.15574) [cs.CV]
Clinical-grade Multi-Organ Pathology Report Generation for Multi-scale Whole Slide Images via a Semantically Guided Medical Text Foundation Model
Authors: Jing Wei Tan, SeungKyu Kim, Eunsu Kim, Sung Hak Lee, Sangjeong Ahn, Won-Ki Jeong
Abstract: Vision language models (VLMs) have achieved success in both natural language comprehension and image recognition tasks. However, their use in pathology report generation for whole slide images (WSIs) is still limited due to the huge size of multi-scale WSIs and the high cost of WSI annotation. Moreover, in most of the existing research on pathology report generation, sufficient validation regarding clinical efficacy has not been conducted. Herein, we propose a novel Patient-level Multi-organ Pathology Report Generation (PMPRG) model, which utilizes multi-scale WSI features from our proposed multi-scale regional vision transformer (MR-ViT) model and their real pathology reports to guide VLM training for accurate pathology report generation. The model then automatically generates a report based on the provided key features and attended regional features. We assessed our model using a WSI dataset consisting of multiple organs, including the colon and kidney. Our model achieved a METEOR score of 0.68, demonstrating the effectiveness of our approach. This model allows pathologists to efficiently generate pathology reports for patients, regardless of the number of WSIs involved.
Submitted 23 September, 2024; originally announced September 2024.
arXiv:2409.10923 (https://arxiv.org/abs/2409.10923) [cs.RO]
Agile Continuous Jumping in Discontinuous Terrains
Authors: Yuxiang Yang, Guanya Shi, Changyi Lin, Xiangyun Meng, Rosario Scalise, Mateo Guaman Castro, Wenhao Yu, Tingnan Zhang, Ding Zhao, Jie Tan, Byron Boots
Abstract: We focus on agile, continuous, and terrain-adaptive jumping of quadrupedal robots in discontinuous terrains such as stairs and stepping stones. Unlike single-step jumping, continuous jumping requires accurately executing highly dynamic motions over long horizons, which is challenging for existing approaches. To accomplish this task, we design a hierarchical learning and control framework, which consists of a learned heightmap predictor for robust terrain perception, a reinforcement-learning-based centroidal-level motion policy for versatile and terrain-adaptive planning, and a low-level model-based leg controller for accurate motion tracking. In addition, we minimize the sim-to-real gap by accurately modeling the hardware characteristics. Our framework enables a Unitree Go1 robot to perform agile and continuous jumps on human-sized stairs and sparse stepping stones, for the first time to the best of our knowledge. In particular, the robot can cross two stair steps in each jump and completes a 3.5m-long, 2.8m-high, 14-step staircase in 4.5 seconds. Moreover, the same policy outperforms baselines in various other parkour tasks, such as jumping over single horizontal or vertical discontinuities. Experiment videos can be found at https://yxyang.github.io/jumping_cod/
Submitted 20 September, 2024; v1 submitted 17 September, 2024; originally announced September 2024.
Comments: Website: https://yxyang.github.io/jumping_cod/
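The three-level hierarchy above (perception, centroidal policy, leg controller) is a clean pipeline to sketch structurally. Every function below is a stand-in; only the data flow reflects the description:

    def predict_heightmap(camera, state):      # learned terrain perception
        return [[0.0] * 8 for _ in range(8)]

    def centroidal_policy(heightmap, state):   # RL-based motion planning
        return {"com_accel": 9.0}

    def leg_controller(command, state):        # model-based torque tracking
        return [0.0] * 12

    state, camera = {}, {}
    for step in range(3):                      # one control cycle per step
        hm = predict_heightmap(camera, state)
        cmd = centroidal_policy(hm, state)
        torques = leg_controller(cmd, state)
    print(len(torques))  # 12 joint torques per cycle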
arXiv:2409.04224 (https://arxiv.org/abs/2409.04224) [cs.AI, cs.LG]
Advancing Multi-Organ Disease Care: A Hierarchical Multi-Agent Reinforcement Learning Framework
Authors: Daniel J. Tan, Qianyi Xu, Kay Choong See, Dilruk Perera, Mengling Feng
Abstract: Multi-organ diseases present significant challenges due to their simultaneous impact on multiple organ systems, necessitating complex and adaptive treatment strategies. Despite recent advancements in AI-powered healthcare decision support systems, existing solutions are limited to individual organ systems. They often ignore the intricate dependencies between organ systems and thereby fail to provide holistic treatment recommendations that are useful in practice. We propose a novel hierarchical multi-agent reinforcement learning (HMARL) framework to address these challenges. This framework uses dedicated agents for each organ system and models their dynamics through explicit inter-agent communication channels, enabling coordinated treatment strategies across organs. Furthermore, we introduce a dual-layer state representation technique to contextualize patient conditions at various hierarchical levels, enhancing treatment accuracy and relevance. Through extensive qualitative and quantitative evaluations in managing sepsis (a complex multi-organ disease), our approach demonstrates its ability to learn effective treatment policies that significantly improve patient survival rates. This framework marks a substantial advancement in clinical decision support systems, pioneering a comprehensive approach for multi-organ treatment recommendations.
Submitted 6 September, 2024; originally announced September 2024.
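One HMARL-style decision step, per the description, has each organ agent exchange messages before acting. A structural numpy toy (the shapes, linear maps, and mean-pooled inbox are ours, not the paper's architecture):

    import numpy as np

    rng = np.random.default_rng(0)

    class OrganAgent:
        def __init__(self, n_obs=4, n_msg=3, n_act=2):
            self.W_msg = rng.normal(size=(n_obs, n_msg))
            self.W_act = rng.normal(size=(n_obs + n_msg, n_act))
        def message(self, obs):
            return obs @ self.W_msg
        def act(self, obs, inbox):
            return int(np.argmax(np.concatenate([obs, inbox]) @ self.W_act))

    agents = {o: OrganAgent() for o in ["cardio", "renal", "hepatic"]}
    obs = {o: rng.normal(size=4) for o in agents}
    msgs = {o: a.message(obs[o]) for o, a in agents.items()}
    for o, a in agents.items():
        inbox = np.mean([m for p, m in msgs.items() if p != o], axis=0)
        print(o, a.act(obs[o], inbox))  # coordinated per-organ treatment action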
