Search | arXiv e-print repository

Showing 1–50 of 1,724 results for author: Chen, T

Searching in archive cs; results are sorted by announcement date (newest first), 50 results per page.

1. arXiv:2502.10277 [pdf, other] cs.CV
Artificial Intelligence to Assess Dental Findings from Panoramic Radiographs -- A Multinational Study
Authors: Yin-Chih Chelsea Wang, Tsao-Lun Chen, Shankeeth Vinayahalingam, Tai-Hsien Wu, Chu Wei Chang, Hsuan Hao Chang, Hung-Jen Wei, Mu-Hsiung Chen, Ching-Chang Ko, David Anssari Moin, Bram van Ginneken, Tong Xi, Hsiao-Cheng Tsai, Min-Huey Chen, Tzu-Ming Harry Hsu, Hye Chou
Abstract: Dental panoramic radiographs (DPRs) are widely used in clinical practice for comprehensive oral assessment but present challenges due to overlapping structures and time constraints in interpretation. This study aimed to establish a solid baseline for the AI-automated assessment of findings in DPRs by developing, evaluating an AI system, and comparing its performance with that of human readers across multinational data sets. We analyzed 6,669 DPRs from three data sets (the Netherlands, Brazil, and Taiwan), focusing on 8 types of dental findings. The AI system combined object detection and semantic segmentation techniques for per-tooth finding identification. Performance metrics included sensitivity, specificity, and area under the receiver operating characteristic curve (AUC-ROC). AI generalizability was tested across data sets, and performance was compared with human dental practitioners. The AI system demonstrated comparable or superior performance to human readers, particularly +67.9% (95% CI: 54.0%-81.9%; p < .001) sensitivity for identifying periapical radiolucencies and +4.7% (95% CI: 1.4%-8.0%; p = .008) sensitivity for identifying missing teeth. The AI achieved a macro-averaged AUC-ROC of 96.2% (95% CI: 94.6%-97.8%) across 8 findings. AI agreements with the reference were comparable to inter-human agreements in 7 of 8 findings except for caries (p = .024). The AI system demonstrated robust generalization across diverse imaging and demographic settings and processed images 79 times faster (95% CI: 75-82) than human readers. The AI system effectively assessed findings in DPRs, achieving performance on par with or better than human experts while significantly reducing interpretation time. These results highlight the potential for integrating AI into clinical workflows to improve diagnostic efficiency and accuracy, and patient management.
Submitted 14 February, 2025; originally announced February 2025.

2. arXiv:2502.08808 [pdf, other] cs.LG, math.OC, stat.ML
A First-order Generative Bilevel Optimization Framework for Diffusion Models
Authors: Quan Xiao, Hui Yuan, A F M Saif, Gaowen Liu, Ramana Kompella, Mengdi Wang, Tianyi Chen
Abstract: Diffusion models, which iteratively denoise data samples to synthesize high-quality outputs, have achieved empirical success across domains. However, optimizing these models for downstream tasks often involves nested bilevel structures, such as tuning hyperparameters for fine-tuning tasks or noise schedules in training dynamics, where traditional bilevel methods fail due to the infinite-dimensional probability space and prohibitive sampling costs. We formalize this challenge as a generative bilevel optimization problem and address two key scenarios: (1) fine-tuning pre-trained models via an inference-only lower-level solver paired with a sample-efficient gradient estimator for the upper level, and (2) training diffusion models from scratch with noise schedule optimization by reparameterizing the lower-level problem and designing a computationally tractable gradient estimator. Our first-order bilevel framework overcomes the incompatibility of conventional bilevel methods with diffusion processes, offering theoretical grounding and computational practicality. Experiments demonstrate that our method outperforms existing fine-tuning and hyperparameter search baselines.
Submitted 12 February, 2025; originally announced February 2025.

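Background note: a generic generative bilevel problem of the kind this abstract describes can be written as below; the symbols ($\lambda$ for the upper-level hyperparameters or noise schedule, $\theta$ for the diffusion-model weights, $R$ for a downstream reward, $F$ for the fine-tuning or training loss) are illustrative and not the paper's notation.

$$
\min_{\lambda}\; \mathbb{E}_{x \sim p_{\theta^{*}(\lambda)}}\big[ R(x, \lambda) \big]
\quad \text{s.t.} \quad
\theta^{*}(\lambda) \in \arg\min_{\theta}\, F(\theta; \lambda).
$$

One way to read the difficulty the authors point to: the lower-level "solution" is a distribution that can only be sampled from, so the upper-level gradient must be estimated from samples rather than obtained by differentiating through a lower-level solver.
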
3. arXiv:2502.08449 [pdf, other] cs.RO, cs.AI
CordViP: Correspondence-based Visuomotor Policy for Dexterous Manipulation in Real-World
Authors: Yankai Fu, Qiuxuan Feng, Ning Chen, Zichen Zhou, Mengzhen Liu, Mingdong Wu, Tianxing Chen, Shanyu Rong, Jiaming Liu, Hao Dong, Shanghang Zhang
Abstract: Achieving human-level dexterity in robots is a key objective in the field of robotic manipulation. Recent advancements in 3D-based imitation learning have shown promising results, providing an effective pathway to achieve this goal. However, obtaining high-quality 3D representations presents two key problems: (1) the quality of point clouds captured by a single-view camera is significantly affected by factors such as camera resolution, positioning, and occlusions caused by the dexterous hand; (2) the global point clouds lack crucial contact information and spatial correspondences, which are necessary for fine-grained dexterous manipulation tasks. To eliminate these limitations, we propose CordViP, a novel framework that constructs and learns correspondences by leveraging the robust 6D pose estimation of objects and robot proprioception. Specifically, we first introduce the interaction-aware point clouds, which establish correspondences between the object and the hand. These point clouds are then used for our pre-training policy, where we also incorporate object-centric contact maps and hand-arm coordination information, effectively capturing both spatial and temporal dynamics. Our method demonstrates exceptional dexterous manipulation capabilities with an average success rate of 90% in four real-world tasks, surpassing other baselines by a large margin. Experimental results also highlight the superior generalization and robustness of CordViP to different objects, viewpoints, and scenarios. Code and videos are available on https://aureleopku.github.io/CordViP.
Submitted 12 February, 2025; originally announced February 2025.

4. arXiv:2502.08445 [pdf, other] cs.LG
LucidAtlas: Learning Uncertainty-Aware, Covariate-Disentangled, Individualized Atlas Representations
Authors: Yining Jiao, Sreekalyani Bhamidi, Huaizhi Qu, Carlton Zdanski, Julia Kimbell, Andrew Prince, Cameron Worden, Samuel Kirse, Christopher Rutter, Benjamin Shields, William Dunn, Jisan Mahmud, Tianlong Chen, Marc Niethammer
Abstract: The goal of this work is to develop principled techniques to extract information from high dimensional data sets with complex dependencies in areas such as medicine that can provide insight into individual as well as population level variation. We develop LucidAtlas, an approach that can represent spatially varying information, and can capture the influence of covariates as well as population uncertainty. As a versatile atlas representation, LucidAtlas offers robust capabilities for covariate interpretation, individualized prediction, population trend analysis, and uncertainty estimation, with the flexibility to incorporate prior knowledge. Additionally, we discuss the trustworthiness and potential risks of neural additive models for analyzing dependent covariates and then introduce a marginalization approach to explain the dependence of an individual predictor on the models' response (the atlas). To validate our method, we demonstrate its generalizability on two medical datasets. Our findings underscore the critical role of by-construction interpretable models in advancing scientific discovery. Our code will be publicly available upon acceptance.
Submitted 13 February, 2025; v1 submitted 12 February, 2025; originally announced February 2025.
Comments: 28 pages

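Background note (not from the paper): a neural additive model, as referenced in the abstract above, predicts a response as a sum of per-covariate networks,

$$
\hat{y}(x) = \beta_0 + \sum_{i=1}^{d} f_i(x_i),
$$

where each $f_i$ is a small network applied to a single covariate $x_i$. When covariates are dependent, the individual curves $f_i$ can be misleading on their own; a marginal effect such as $\bar{f}_i(x_i) = \mathbb{E}\big[\hat{y}(x) \mid x_i\big]$ is one generic way to express the kind of marginalization the authors mention.
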
5. arXiv:2502.07971 [pdf, other] cs.IR, cs.AI, cs.LG
ReTreever: Tree-based Coarse-to-Fine Representations for Retrieval
Authors: Shubham Gupta, Zichao Li, Tianyi Chen, Cem Subakan, Siva Reddy, Perouz Taslakian, Valentina Zantedeschi
Abstract: Document retrieval is a core component of question-answering systems, as it enables conditioning answer generation on new and large-scale corpora. While effective, the standard practice of encoding documents into high-dimensional embeddings for similarity search entails large memory and compute footprints, and also makes it hard to inspect the inner workings of the system. In this paper, we propose a tree-based method for organizing and representing reference documents at various granular levels, which offers the flexibility to balance cost and utility, and eases the inspection of the corpus content and retrieval operations. Our method, called ReTreever, jointly learns a routing function per internal node of a binary tree such that query and reference documents are assigned to similar tree branches, hence directly optimizing for retrieval performance. Our evaluations show that ReTreever generally preserves full representation accuracy. Its hierarchical structure further provides strong coarse representations and enhances transparency by indirectly learning meaningful semantic groupings. Among hierarchical retrieval methods, ReTreever achieves the best retrieval accuracy at the lowest latency, proving that this family of techniques can be viable in practical applications.
Submitted 11 February, 2025; originally announced February 2025.
ACM Class: I.2; I.7; E.2; H.3

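To make the "routing function per internal node" idea concrete, here is a minimal, hypothetical sketch of routing embeddings down a learned binary tree; the class, parameters, and example values are invented for illustration and are not taken from the ReTreever code.

```python
import numpy as np

class RoutingTree:
    """Toy binary routing tree: each internal node holds a linear scorer
    that sends an embedding to its left or right child."""

    def __init__(self, depth: int, dim: int, seed: int = 0):
        rng = np.random.default_rng(seed)
        self.depth = depth
        # one weight vector and bias per internal node (2**depth - 1 of them)
        self.w = rng.normal(size=(2 ** depth - 1, dim))
        self.b = np.zeros(2 ** depth - 1)

    def route(self, x: np.ndarray) -> int:
        """Return the leaf index an embedding is assigned to."""
        node = 0
        for _ in range(self.depth):
            go_right = float(self.w[node] @ x + self.b[node]) > 0.0
            node = 2 * node + (2 if go_right else 1)  # heap-style child index
        return node - (2 ** self.depth - 1)  # leaf index in [0, 2**depth)

# Queries and documents that land in the same (or nearby) leaves count as
# coarse matches; training would push matching query/document pairs toward
# the same branches, which is the property the abstract says is optimized.
tree = RoutingTree(depth=3, dim=4)
doc = np.array([0.2, -0.1, 0.5, 0.3])
query = np.array([0.25, -0.05, 0.45, 0.2])
print(tree.route(doc), tree.route(query))
```
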
6. arXiv:2502.07942 [pdf, other] cs.MA, cs.LG
Symbiotic Cooperation for Web Agents: Harnessing Complementary Strengths of Large and Small LLMs
Authors: Ruichen Zhang, Mufan Qiu, Zhen Tan, Mohan Zhang, Vincent Lu, Jie Peng, Kaidi Xu, Leandro Z. Agudelo, Peter Qian, Tianlong Chen
Abstract: Web browsing agents powered by large language models (LLMs) have shown tremendous potential in automating complex web-based tasks. Existing approaches typically rely on large LLMs (e.g., GPT-4o) to explore web environments and generate trajectory data, which is then used either for demonstration retrieval (for large LLMs) or to distill small LLMs (e.g., Llama3) in a process that remains decoupled from the exploration. In this paper, we propose AgentSymbiotic, an iterative framework that couples data synthesis with task-performance, yielding a "symbiotic improvement" for both large and small LLMs. Our study uncovers a complementary dynamic between LLM types: while large LLMs excel at generating high-quality trajectories for distillation, the distilled small LLMs, owing to their distinct reasoning capabilities, often choose actions that diverge from those of their larger counterparts. This divergence drives the exploration of novel trajectories, thereby enriching the synthesized data. However, we also observe that the performance of small LLMs becomes a bottleneck in this iterative enhancement process. To address this, we propose two innovations in LLM distillation: a speculative data synthesis strategy that mitigates off-policy bias, and a multi-task learning approach designed to boost the reasoning capabilities of the student LLM. Furthermore, we introduce a Hybrid Mode for Privacy Preservation to address user privacy concerns. Evaluated on the WEBARENA benchmark, AgentSymbiotic achieves SOTA performance with both LLM types. Our best Large LLM agent reaches 52%, surpassing the previous best of 45%, while our 8B distilled model demonstrates a competitive 49%, exceeding the prior best of 28%. Code will be released upon acceptance.
Submitted 11 February, 2025; originally announced February 2025.

7. arXiv:2502.07056 [pdf, other] cs.AI, cs.LG
Autonomous Deep Agent
Authors: Amy Yu, Erik Lebedev, Lincoln Everett, Xiaoxin Chen, Terry Chen
Abstract: This technical brief introduces Deep Agent, an advanced autonomous AI system designed to manage complex multi-phase tasks through a novel hierarchical task management architecture. The system's foundation is built on our Hierarchical Task DAG (HTDAG) framework, which dynamically decomposes high-level objectives into manageable sub-tasks while rigorously maintaining dependencies and execution coherence. Deep Agent advances beyond traditional agent systems through three key innovations: First, it implements a recursive two-stage planner-executor architecture that enables continuous task refinement and adaptation as circumstances change. Second, it features an Autonomous API & Tool Creation (AATC) system that automatically generates reusable components from UI interactions, substantially reducing operational costs for similar tasks. Third, it incorporates Prompt Tweaking Engine and Autonomous Prompt Feedback Learning components that optimize Large Language Model prompts for specific scenarios, enhancing both inference accuracy and operational stability. These components are integrated to form a service infrastructure that manages user contexts, handles complex task dependencies, and orchestrates end-to-end agentic workflow execution. Through this sophisticated architecture, Deep Agent establishes a novel paradigm in self-governing AI systems, demonstrating robust capability to independently handle intricate, multi-step tasks while maintaining consistent efficiency and reliability through continuous self-optimization.
Submitted 10 February, 2025; originally announced February 2025.
ACM Class: I.2.6; I.2.7

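The hierarchical task DAG described above can be illustrated with a small data structure in which every task records its dependencies and optional sub-tasks. The names and fields below are a hypothetical sketch, not the Deep Agent implementation.

```python
from dataclasses import dataclass, field
from graphlib import TopologicalSorter  # standard library, Python 3.9+

@dataclass
class Task:
    name: str
    depends_on: list[str] = field(default_factory=list)   # DAG edges
    subtasks: list["Task"] = field(default_factory=list)  # hierarchical decomposition

def execution_order(tasks: list[Task]) -> list[str]:
    """Flatten sub-tasks and return one dependency-respecting execution order."""
    graph: dict[str, set[str]] = {}
    stack = list(tasks)
    while stack:
        t = stack.pop()
        graph.setdefault(t.name, set()).update(t.depends_on)
        for sub in t.subtasks:
            # a parent task implicitly waits for its sub-tasks to finish
            graph.setdefault(t.name, set()).add(sub.name)
            stack.append(sub)
    return list(TopologicalSorter(graph).static_order())

trip = Task("book_trip", subtasks=[
    Task("book_flight"),
    Task("book_hotel", depends_on=["book_flight"]),
])
print(execution_order([trip]))  # ['book_flight', 'book_hotel', 'book_trip']
```
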
8. arXiv:2502.06784 [pdf, other] cs.LG, cs.AI, cs.DB
RelGNN: Composite Message Passing for Relational Deep Learning
Authors: Tianlang Chen, Charilaos Kanatsoulis, Jure Leskovec
Abstract: Predictive tasks on relational databases are critical in real-world applications spanning e-commerce, healthcare, and social media. To address these tasks effectively, Relational Deep Learning (RDL) encodes relational data as graphs, enabling Graph Neural Networks (GNNs) to exploit relational structures for improved predictions. However, existing heterogeneous GNNs often overlook the intrinsic structural properties of relational databases, leading to modeling inefficiencies. Here we introduce RelGNN, a novel GNN framework specifically designed to capture the unique characteristics of relational databases. At the core of our approach is the introduction of atomic routes, which are sequences of nodes forming high-order tripartite structures. Building upon these atomic routes, RelGNN designs new composite message passing mechanisms between heterogeneous nodes, allowing direct single-hop interactions between them. This approach avoids redundant aggregations and mitigates information entanglement, ultimately leading to more efficient and accurate predictive modeling. RelGNN is evaluated on 30 diverse real-world tasks from RelBench (Fey et al., 2024), and consistently achieves state-of-the-art accuracy with up to 25% improvement.
Submitted 10 February, 2025; originally announced February 2025.
Comments: 14 pages

9. arXiv:2502.06494 [pdf, other] cs.CL, cs.AI
GuideLLM: Exploring LLM-Guided Conversation with Applications in Autobiography Interviewing
Authors: Jinhao Duan, Xinyu Zhao, Zhuoxuan Zhang, Eunhye Ko, Lily Boddy, Chenan Wang, Tianhao Li, Alexander Rasgon, Junyuan Hong, Min Kyung Lee, Chenxi Yuan, Qi Long, Ying Ding, Tianlong Chen, Kaidi Xu
Abstract: Although Large Language Models (LLMs) succeed in human-guided conversations such as instruction following and question answering, the potential of LLM-guided conversations, where LLMs direct the discourse and steer the conversation's objectives, remains under-explored. In this study, we first characterize LLM-guided conversation into three fundamental components: (i) Goal Navigation; (ii) Context Management; (iii) Empathetic Engagement, and propose GuideLLM as an installation. We then implement an interviewing environment for the evaluation of LLM-guided conversation. Specifically, various topics are involved in this environment for comprehensive interviewing evaluation, resulting in around 1.4k turns of utterances, 184k tokens, and over 200 events mentioned during the interviewing for each chatbot evaluation. We compare GuideLLM with 6 state-of-the-art LLMs such as GPT-4o and Llama-3-70b-Instruct, from the perspective of interviewing quality, and autobiography generation quality. For automatic evaluation, we derive user proxies from multiple autobiographies and employ LLM-as-a-judge to score LLM behaviors. We further conduct a human-involved experiment by employing 45 human participants to chat with GuideLLM and baselines. We then collect human feedback, preferences, and ratings regarding the qualities of conversation and autobiography. Experimental results indicate that GuideLLM significantly outperforms baseline LLMs in automatic evaluation and achieves consistent leading performances in human ratings.
Submitted 10 February, 2025; originally announced February 2025.
Comments: 31 pages; the first three authors contributed equally

10. arXiv:2502.06309 [pdf, other] cs.LG, cs.AR, math.OC
Analog In-memory Training on General Non-ideal Resistive Elements: The Impact of Response Functions
Authors: Zhaoxian Wu, Quan Xiao, Tayfun Gokmen, Omobayode Fagbohungbe, Tianyi Chen
Abstract: As the economic and environmental costs of training and deploying large vision or language models increase dramatically, analog in-memory computing (AIMC) emerges as a promising energy-efficient solution. However, the training perspective, especially its training dynamic, is underexplored. In AIMC hardware, the trainable weights are represented by the conductance of resistive elements and updated using consecutive electrical pulses. Among all the physical properties of resistive elements, the response to the pulses directly affects the training dynamics. This paper first provides a theoretical foundation for gradient-based training on AIMC hardware and studies the impact of response functions. We demonstrate that noisy update and asymmetric response functions negatively impact Analog SGD by imposing an implicit penalty term on the objective. To overcome the issue, Tiki-Taka, a residual learning algorithm, converges exactly to a critical point by optimizing a main array and a residual array bilevelly. The conclusion is supported by simulations validating our theoretical insights.
Submitted 14 February, 2025; v1 submitted 10 February, 2025; originally announced February 2025.

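The bias that asymmetric pulse responses introduce can be seen in a toy one-dimensional simulation; the response model and constants below are invented for illustration and do not reproduce the paper's device model. Tiki-Taka, as described in the abstract, counteracts this bias with a main array and a residual array; that mechanism is not sketched here.

```python
import numpy as np

def asymmetric_update(w, grad, lr, up=1.0, down=0.3):
    """Toy analog-style update: upward and downward conductance changes
    respond with different magnitudes (an asymmetric response function)."""
    step = -lr * grad
    return w + np.where(step >= 0, up, down) * step

rng = np.random.default_rng(0)
target = 2.0                      # minimizer of f(w) = 0.5 * (w - target)**2
w_sgd = w_analog = 0.0
for _ in range(20000):
    g_sgd = (w_sgd - target) + rng.normal(scale=0.5)        # noisy gradient
    g_ana = (w_analog - target) + rng.normal(scale=0.5)
    w_sgd = w_sgd - 0.01 * g_sgd                             # ideal SGD
    w_analog = asymmetric_update(w_analog, g_ana, lr=0.01)   # asymmetric device
print(round(w_sgd, 2), round(w_analog, 2))  # SGD hovers near 2.0; the analog iterate settles above it
```
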
Then, a Multi-Scale Dynamic Fusion (MSDF) module is applied to dynamically fuse the projected logits of multi-scale features at different stages of the student model, further improving the performance of our method at the feature level. We verify our method on four architectures (CNNs, Transformers, MLPs, and Mambas) and two datasets (CIFAR-100 and Tiny-ImageNet). Compared with the best available method, our MLDR-KD improves student model performance with gains of up to 4.86% on the CIFAR-100 and 2.78% on the Tiny-ImageNet datasets, respectively, showing robustness and generality in heterogeneous distillation. Code will be released soon. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.06189v1-abstract-full').style.display = 'none'; document.getElementById('2502.06189v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05835">arXiv:2502.05835</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05835">pdf</a>, <a href="https://arxiv.org/format/2502.05835">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Contrastive Representation Distillation via Multi-Scale Feature Decoupling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wang%2C+C">Cuipeng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tieyuan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Haipeng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05835v1-abstract-short" style="display: inline;"> Knowledge distillation is a technique aimed at enhancing the performance of a smaller student network without increasing its parameter size by transferring knowledge from a larger, pre-trained teacher network. Previous approaches have predominantly focused on distilling global feature information while overlooking the importance of disentangling the diverse types of information embedded within dif&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05835v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05835v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05835v1-abstract-full" style="display: none;"> Knowledge distillation is a technique aimed at enhancing the performance of a smaller student network without increasing its parameter size by transferring knowledge from a larger, pre-trained teacher network. Previous approaches have predominantly focused on distilling global feature information while overlooking the importance of disentangling the diverse types of information embedded within different regions of the feature. 
In this work, we introduce multi-scale decoupling in the feature transfer process for the first time, where the decoupled local features are individually processed and integrated with contrastive learning. Moreover, compared to previous contrastive learning-based distillation methods, our approach not only reduces computational costs but also enhances efficiency, enabling performance improvements for the student network using only single-batch samples. Extensive evaluations on CIFAR-100 and ImageNet demonstrate our method&#39;s superiority, with some student networks distilled using our method even surpassing the performance of their pre-trained teacher networks. These results underscore the effectiveness of our approach in enabling student networks to thoroughly absorb knowledge from teacher networks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05835v1-abstract-full').style.display = 'none'; document.getElementById('2502.05835v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05431">arXiv:2502.05431</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05431">pdf</a>, <a href="https://arxiv.org/format/2502.05431">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> APE: Faster and Longer Context-Augmented Generation via Adaptive Parallel Encoding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yang%2C+X">Xinyu Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tianqi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Beidi Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05431v2-abstract-short" style="display: inline;"> Context-augmented generation (CAG) techniques, including RAG and ICL, require the efficient combination of multiple contexts to generate responses to user queries. Directly inputting these contexts as a sequence introduces a considerable computational burden by re-encoding the combined selection of contexts for every request. To address this, we explore the promising potential of parallel encoding&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05431v2-abstract-full').style.display = 'inline'; document.getElementById('2502.05431v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05431v2-abstract-full" style="display: none;"> Context-augmented generation (CAG) techniques, including RAG and ICL, require the efficient combination of multiple contexts to generate responses to user queries. 
Directly inputting these contexts as a sequence introduces a considerable computational burden by re-encoding the combined selection of contexts for every request. To address this, we explore the promising potential of parallel encoding to independently pre-compute and cache each context&#39;s KV states. This approach enables the direct loading of cached states during inference while accommodating more contexts through position reuse across contexts. However, due to misalignments in attention distribution, directly applying parallel encoding results in a significant performance drop. To enable effective and efficient CAG, we propose Adaptive Parallel Encoding ($\textbf{APE}$), which introduces a shared prefix, an attention temperature, and a scaling factor to align the distribution of parallel encoding with that of sequential encoding. Results on RAG and ICL tasks demonstrate that APE can preserve 98% and 93% of sequential encoding performance using the same inputs while outperforming parallel encoding by 3.6% and 7.9%, respectively. It also scales to many-shot CAG, effectively encoding hundreds of contexts in parallel. Efficiency evaluation shows that APE can achieve an end-to-end 4.5$\times$ speedup by reducing prefilling time by 28$\times$ for a 128K-length context. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05431v2-abstract-full').style.display = 'none'; document.getElementById('2502.05431v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICLR 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.05255">arXiv:2502.05255</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.05255">pdf</a>, <a href="https://arxiv.org/format/2502.05255">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Physics and Society">physics.soc-ph</span> </div> </div> <p class="title is-5 mathjax"> Incivility and Contentiousness Spillover between COVID-19 and Climate Science Engagement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Narimanzadeh%2C+H">Hasti Narimanzadeh</a>, <a href="/search/cs?searchtype=author&amp;query=Badie-Modiri%2C+A">Arash Badie-Modiri</a>, <a href="/search/cs?searchtype=author&amp;query=Smirnova%2C+I">Iuliia Smirnova</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T+H+Y">Ted Hsuan Yun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.05255v1-abstract-short" style="display: inline;"> Affective polarization and its accompanying cleavage-based sorting drives incivility and contentiousness around climate change and other science-related issues. Looking at the COVID-19 period, we study cross-domain spillover of incivility and contentiousness in public engagements with climate change and climate science on Twitter and Reddit. We find strong evidence of the signatures of affective p&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05255v1-abstract-full').style.display = 'inline'; document.getElementById('2502.05255v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.05255v1-abstract-full" style="display: none;"> Affective polarization and its accompanying cleavage-based sorting drives incivility and contentiousness around climate change and other science-related issues. Looking at the COVID-19 period, we study cross-domain spillover of incivility and contentiousness in public engagements with climate change and climate science on Twitter and Reddit. We find strong evidence of the signatures of affective polarization surrounding COVID-19 spilling into the climate change domain. Across different social media systems, COVID-19 content is associated with incivility and contentiousness in climate discussions. These patterns of increased antagonism were responsive to pandemic events that made the link between science and public policy more salient. We also show that the observed spillover activated along pre-pandemic political cleavages, specifically anti-internationalist populist beliefs, that linked climate policy opposition to vaccine hesitancy. Our findings highlight the dangers of entrenched cross-domain polarization manifesting as spillover of antagonistic behavior. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.05255v1-abstract-full').style.display = 'none'; document.getElementById('2502.05255v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">29 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.04202">arXiv:2502.04202</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.04202">pdf</a>, <a href="https://arxiv.org/format/2502.04202">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> GUIWatcher: Automatically Detecting GUI Lags by Analyzing Mobile Application Screencasts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+W">Wei Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+F">Feng Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Guo%2C+L">Linqiang Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tse-Hsun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hassan%2C+A+E">Ahmed E. Hassan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.04202v1-abstract-short" style="display: inline;"> The Graphical User Interface (GUI) plays a central role in mobile applications, directly affecting usability and user satisfaction. Poor GUI performance, such as lag or unresponsiveness, can lead to negative user experience and decreased mobile application (app) ratings. In this paper, we present GUIWatcher, a framework designed to detect GUI lags by analyzing screencasts recorded during mobile ap&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04202v1-abstract-full').style.display = 'inline'; document.getElementById('2502.04202v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.04202v1-abstract-full" style="display: none;"> The Graphical User Interface (GUI) plays a central role in mobile applications, directly affecting usability and user satisfaction. Poor GUI performance, such as lag or unresponsiveness, can lead to negative user experience and decreased mobile application (app) ratings. In this paper, we present GUIWatcher, a framework designed to detect GUI lags by analyzing screencasts recorded during mobile app testing. GUIWatcher uses computer vision techniques to identify three types of lag-inducing frames (i.e., janky frames, long loading frames, and frozen frames) and prioritizes the most severe ones that significantly impact user experience. Our approach was evaluated using real-world mobile application tests, achieving high accuracy in detecting GUI lags in screencasts, with an average precision of 0.91 and recall of 0.96. 
The comprehensive bug reports generated from the lags detected by GUIWatcher help developers focus on the more critical issues and debug them efficiently. Additionally, GUIWatcher has been deployed in a real-world production environment, continuously monitoring app performance and successfully identifying critical GUI performance issues. By offering a practical solution for identifying and addressing GUI lags, GUIWatcher contributes to enhancing user satisfaction and the overall quality of mobile apps. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.04202v1-abstract-full').style.display = 'none'; document.getElementById('2502.04202v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ICSE-SEIP 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.03674">arXiv:2502.03674</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.03674">pdf</a>, <a href="https://arxiv.org/format/2502.03674">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> An Empirical Study of Methods for Small Object Detection from Satellite Imagery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+X">Xiaohui Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Chakravarty%2C+A">Aniv Chakravarty</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+L">Lichuan Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Wei%2C+Z">Zhenchun Wei</a>, <a href="/search/cs?searchtype=author&amp;query=Lichtenberg%2C+E">Elinor Lichtenberg</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tian Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.03674v1-abstract-short" style="display: inline;"> This paper reviews object detection methods for finding small objects from remote sensing imagery and provides an empirical evaluation of four state-of-the-art methods to gain insights into method performance and technical challenges. In particular, we use car detection from urban satellite images and bee box detection from satellite images of agricultural lands as application scenarios. 
Drawing f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03674v1-abstract-full').style.display = 'inline'; document.getElementById('2502.03674v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.03674v1-abstract-full" style="display: none;"> This paper reviews object detection methods for finding small objects from remote sensing imagery and provides an empirical evaluation of four state-of-the-art methods to gain insights into method performance and technical challenges. In particular, we use car detection from urban satellite images and bee box detection from satellite images of agricultural lands as application scenarios. Drawing from the existing surveys and literature, we identify several top-performing methods for the empirical study. Public, high-resolution satellite image datasets are used in our experiments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.03674v1-abstract-full').style.display = 'none'; document.getElementById('2502.03674v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.01670">arXiv:2502.01670</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.01670">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Hardware-Efficient Photonic Tensor Core: Accelerating Deep Neural Networks with Structured Compression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ning%2C+S">Shupeng Ning</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+H">Hanqing Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Chenghao Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Gu%2C+J">Jiaqi Gu</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+D+Z">David Z. Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+R+T">Ray T. Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.01670v1-abstract-short" style="display: inline;"> Recent advancements in artificial intelligence (AI) and deep neural networks (DNNs) have revolutionized numerous fields, enabling complex tasks by extracting intricate features from large datasets. However, the exponential growth in computational demands has outstripped the capabilities of traditional electrical hardware accelerators. 
Optical computing offers a promising alternative due to its inh&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01670v1-abstract-full').style.display = 'inline'; document.getElementById('2502.01670v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.01670v1-abstract-full" style="display: none;"> Recent advancements in artificial intelligence (AI) and deep neural networks (DNNs) have revolutionized numerous fields, enabling complex tasks by extracting intricate features from large datasets. However, the exponential growth in computational demands has outstripped the capabilities of traditional electrical hardware accelerators. Optical computing offers a promising alternative due to its inherent advantages of parallelism, high computational speed, and low power consumption. Yet, current photonic integrated circuits (PICs) designed for general matrix multiplication (GEMM) are constrained by large footprints, high costs of electro-optical (E-O) interfaces, and high control complexity, limiting their scalability. To overcome these challenges, we introduce a block-circulant photonic tensor core (CirPTC) for a structure-compressed optical neural network (StrC-ONN) architecture. By applying a structured compression strategy to weight matrices, StrC-ONN significantly reduces model parameters and hardware requirements while preserving the universal representability of networks and maintaining comparable expressivity. Additionally, we propose a hardware-aware training framework to compensate for on-chip nonidealities to improve model robustness and accuracy. We experimentally demonstrate image processing and classification tasks, achieving up to a 74.91% reduction in trainable parameters while maintaining competitive accuracies. Performance analysis expects a computational density of 5.84 tera operations per second (TOPS) per mm^2 and a power efficiency of 47.94 TOPS/W, marking a 6.87-times improvement achieved through the hardware-software co-design approach. By reducing both hardware requirements and control complexity across multiple dimensions, this work explores a new pathway to push the limits of optical computing in the pursuit of high efficiency and scalability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.01670v1-abstract-full').style.display = 'none'; document.getElementById('2502.01670v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00361">arXiv:2502.00361</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00361">pdf</a>, <a href="https://arxiv.org/format/2502.00361">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Soft Diffusion Actor-Critic: Efficient Online Reinforcement Learning for Diffusion Policy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+H">Haitong Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tianyi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kai Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+N">Na Li</a>, <a href="/search/cs?searchtype=author&amp;query=Dai%2C+B">Bo Dai</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00361v2-abstract-short" style="display: inline;"> Diffusion policies have achieved superior performance in imitation learning and offline reinforcement learning (RL) due to their rich expressiveness. However, the vanilla diffusion training procedure requires samples from target distribution, which is impossible in online RL since we cannot sample from the optimal policy, making training diffusion policies highly non-trivial in online RL. Backprop&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00361v2-abstract-full').style.display = 'inline'; document.getElementById('2502.00361v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00361v2-abstract-full" style="display: none;"> Diffusion policies have achieved superior performance in imitation learning and offline reinforcement learning (RL) due to their rich expressiveness. However, the vanilla diffusion training procedure requires samples from target distribution, which is impossible in online RL since we cannot sample from the optimal policy, making training diffusion policies highly non-trivial in online RL. Backpropagating policy gradient through the diffusion process incurs huge computational costs and instability, thus being expensive and impractical. To enable efficient diffusion policy training for online RL, we propose Soft Diffusion Actor-Critic (SDAC), exploiting the viewpoint of diffusion models as noise-perturbed energy-based models. The proposed SDAC relies solely on the state-action value function as the energy functions to train diffusion policies, bypassing sampling from the optimal policy while maintaining lightweight computations. We conducted comprehensive comparisons on MuJoCo benchmarks. The empirical results show that SDAC outperforms all recent diffusion-policy online RLs on most tasks, and improves more than 120% over soft actor-critic on complex locomotion tasks such as Humanoid and Ant. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00361v2-abstract-full').style.display = 'none'; document.getElementById('2502.00361v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00334">arXiv:2502.00334</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00334">pdf</a>, <a href="https://arxiv.org/format/2502.00334">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> UGPhysics: A Comprehensive Benchmark for Undergraduate Physics Reasoning with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+X">Xin Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Q">Qiyun Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Xiao%2C+T">Tong Xiao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tianhao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Y">Yuchen Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jiaxin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Diao%2C+S">Shizhe Diao</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Can Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00334v2-abstract-short" style="display: inline;"> Large language models (LLMs) have demonstrated remarkable capabilities in solving complex reasoning tasks, particularly in mathematics. However, the domain of physics reasoning presents unique challenges that have received significantly less attention. Existing benchmarks often fall short in evaluating LLMs&#39; abilities on the breadth and depth of undergraduate-level physics, underscoring the need f&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00334v2-abstract-full').style.display = 'inline'; document.getElementById('2502.00334v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00334v2-abstract-full" style="display: none;"> Large language models (LLMs) have demonstrated remarkable capabilities in solving complex reasoning tasks, particularly in mathematics. However, the domain of physics reasoning presents unique challenges that have received significantly less attention. 
Existing benchmarks often fall short in evaluating LLMs&#39; abilities on the breadth and depth of undergraduate-level physics, underscoring the need for a comprehensive evaluation. To fill this gap, we introduce UGPhysics, a large-scale and comprehensive benchmark specifically designed to evaluate UnderGraduate-level Physics (UGPhysics) reasoning with LLMs. UGPhysics includes 5,520 undergraduate-level physics problems in both English and Chinese, covering 13 subjects with seven different answer types and four distinct physics reasoning skills, all rigorously screened for data leakage. Additionally, we develop a Model-Assistant Rule-based Judgment (MARJ) pipeline specifically tailored for assessing answer correctness of physics problems, ensuring accurate evaluation. Our evaluation of 31 leading LLMs shows that the highest overall accuracy, 49.8% (achieved by OpenAI-o1-mini), emphasizes the necessity for models with stronger physics reasoning skills, beyond math abilities. We hope UGPhysics, along with MARJ, will drive future advancements in AI for physics reasoning. Codes and data are available at https://github.com/YangLabHKUST/UGPhysics . <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00334v2-abstract-full').style.display = 'none'; document.getElementById('2502.00334v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.19329">arXiv:2501.19329</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.19329">pdf</a>, <a href="https://arxiv.org/format/2501.19329">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Let Human Sketches Help: Empowering Challenging Image Segmentation Task with Freehand Sketches </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zang%2C+Y">Ying Zang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+R">Runlong Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jianqi Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+Y">Yidong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+Z">Ziyue Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+W">Wenjun Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+D">Didi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+L">Lanyun Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zejian Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+D">Deyi Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tianrun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" 
id="2501.19329v1-abstract-short" style="display: inline;"> Sketches, with their expressive potential, allow humans to convey the essence of an object through even a rough contour. For the first time, we harness this expressive potential to improve segmentation performance in challenging tasks like camouflaged object detection (COD). Our approach introduces an innovative sketch-guided interactive segmentation framework, allowing users to intuitively annota&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.19329v1-abstract-full').style.display = 'inline'; document.getElementById('2501.19329v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.19329v1-abstract-full" style="display: none;"> Sketches, with their expressive potential, allow humans to convey the essence of an object through even a rough contour. For the first time, we harness this expressive potential to improve segmentation performance in challenging tasks like camouflaged object detection (COD). Our approach introduces an innovative sketch-guided interactive segmentation framework, allowing users to intuitively annotate objects with freehand sketches (drawing a rough contour of the object) instead of the traditional bounding boxes or points used in classic interactive segmentation models like SAM. We demonstrate that sketch input can significantly improve performance in existing iterative segmentation methods, outperforming text or bounding box annotations. Additionally, we introduce key modifications to network architectures and a novel sketch augmentation technique to fully harness the power of sketch input and further boost segmentation accuracy. Remarkably, our model&#39; s output can be directly used to train other neural networks, achieving results comparable to pixel-by-pixel annotations--while reducing annotation time by up to 120 times, which shows great potential in democratizing the annotation process and enabling model training with less reliance on resource-intensive, laborious pixel-level annotations. We also present KOSCamo+, the first freehand sketch dataset for camouflaged object detection. The dataset, code, and the labeling tool will be open sourced. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.19329v1-abstract-full').style.display = 'none'; document.getElementById('2501.19329v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.18170">arXiv:2501.18170</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.18170">pdf</a>, <a href="https://arxiv.org/format/2501.18170">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Continually Evolved Multimodal Foundation Models for Cancer Prognosis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Peng%2C+J">Jie Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+S">Shuang Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+L">Longwei Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+Y">Yiran Song</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Mohan Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+K">Kaixiong Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+F">Feng Xie</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+M">Mingquan Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+R">Rui Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tianlong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.18170v2-abstract-short" style="display: inline;"> Cancer prognosis is a critical task that involves predicting patient outcomes and survival rates. To enhance prediction accuracy, previous studies have integrated diverse data modalities, such as clinical notes, medical images, and genomic data, leveraging their complementary information. However, existing approaches face two major limitations. First, they struggle to incorporate newly arrived dat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18170v2-abstract-full').style.display = 'inline'; document.getElementById('2501.18170v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.18170v2-abstract-full" style="display: none;"> Cancer prognosis is a critical task that involves predicting patient outcomes and survival rates. To enhance prediction accuracy, previous studies have integrated diverse data modalities, such as clinical notes, medical images, and genomic data, leveraging their complementary information. However, existing approaches face two major limitations. First, they struggle to incorporate newly arrived data with varying distributions into training, such as patient records from different hospitals, thus rendering sub-optimal generalizability and limited utility in real-world applications. Second, most multimodal integration methods rely on simplistic concatenation or task-specific pipelines, which fail to capture the complex interdependencies across modalities. To address these, we propose a continually evolving multi-modal foundation model. Extensive experiments on the TCGA dataset demonstrate the effectiveness of our approach, highlighting its potential to advance cancer prognosis by enabling robust and adaptive multimodal integration. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.18170v2-abstract-full').style.display = 'none'; document.getElementById('2501.18170v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 30 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">9 pages, 1 figure</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> I.2.7; J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.17860">arXiv:2501.17860</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.17860">pdf</a>, <a href="https://arxiv.org/format/2501.17860">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Dialogue is Better Than Monologue: Instructing Medical LLMs via Strategical Conversations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Z">Zijie Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+X">Xinyu Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+J">Jie Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Z">Zhuangdi Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Q">Qingyu Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Hu%2C+X">Xia Hu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tianlong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.17860v1-abstract-short" style="display: inline;"> Current medical AI systems often fail to replicate real-world clinical reasoning, as they are predominantly trained and evaluated on static text and question-answer tasks. These tuning methods and benchmarks overlook critical aspects like evidence-based reasoning and handling distracting information. To bridge this gap, we introduce a novel benchmark that simulates real-world diagnostic scenarios,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17860v1-abstract-full').style.display = 'inline'; document.getElementById('2501.17860v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.17860v1-abstract-full" style="display: none;"> Current medical AI systems often fail to replicate real-world clinical reasoning, as they are predominantly trained and evaluated on static text and question-answer tasks. These tuning methods and benchmarks overlook critical aspects like evidence-based reasoning and handling distracting information. 
To bridge this gap, we introduce a novel benchmark that simulates real-world diagnostic scenarios, integrating noise and difficulty levels aligned with USMLE standards. Moreover, we explore dialogue-based fine-tuning, which transforms static datasets into conversational formats to better capture iterative reasoning processes. Experiments show that dialogue-tuned models outperform traditional methods, with improvements of $9.64\%$ in multi-round reasoning scenarios and $6.18\%$ in accuracy in a noisy environment. Our findings highlight dialogue tuning as a promising approach for advancing clinically aligned and robust medical AI systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.17860v1-abstract-full').style.display = 'none'; document.getElementById('2501.17860v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16811">arXiv:2501.16811</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.16811">pdf</a>, <a href="https://arxiv.org/format/2501.16811">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Not Every Patch is Needed: Towards a More Efficient and Effective Backbone for Video-based Person Re-identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+L">Lanyun Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tianrun Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Ji%2C+D">Deyi Ji</a>, <a href="/search/cs?searchtype=author&amp;query=Ye%2C+J">Jieping Ye</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+J">Jun Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16811v1-abstract-short" style="display: inline;"> This paper proposes a new effective and efficient plug-and-play backbone for video-based person re-identification (ReID). Conventional video-based ReID methods typically use CNN or transformer backbones to extract deep features for every position in every sampled video frame. Here, we argue that this exhaustive feature extraction could be unnecessary, since we find that different frames in a ReID&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16811v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16811v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16811v1-abstract-full" style="display: none;"> This paper proposes a new effective and efficient plug-and-play backbone for video-based person re-identification (ReID). Conventional video-based ReID methods typically use CNN or transformer backbones to extract deep features for every position in every sampled video frame. 
Here, we argue that this exhaustive feature extraction could be unnecessary, since we find that different frames in a ReID video often exhibit small differences and contain many similar regions due to the relatively slight movements of human beings. Inspired by this, a more selective, efficient paradigm is explored in this paper. Specifically, we introduce a patch selection mechanism to reduce computational cost by choosing only the crucial and non-repetitive patches for feature extraction. Additionally, we present a novel network structure that generates and utilizes pseudo frame global context to address the issue of incomplete views resulting from sparse inputs. By incorporating these new designs, our backbone can achieve both high performance and low computational cost. Extensive experiments on multiple datasets show that our approach reduces the computational cost by 74\% compared to ViT-B and 28\% compared to ResNet50, while the accuracy is on par with ViT-B and outperforms ResNet50 significantly. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16811v1-abstract-full').style.display = 'none'; document.getElementById('2501.16811v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">IEEE TIP</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16590">arXiv:2501.16590</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.16590">pdf</a>, <a href="https://arxiv.org/format/2501.16590">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Benchmarking Model Predictive Control and Reinforcement Learning Based Control for Legged Robot Locomotion in MuJoCo Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Akki%2C+S">Shivayogi Akki</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tan Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16590v1-abstract-short" style="display: inline;"> Model Predictive Control (MPC) and Reinforcement Learning (RL) are two prominent strategies for controlling legged robots, each with unique strengths. RL learns control policies through system interaction, adapting to various scenarios, whereas MPC relies on a predefined mathematical model to solve optimization problems in real-time. 
Despite their widespread use, there is a lack of direct comparat&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16590v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16590v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16590v1-abstract-full" style="display: none;"> Model Predictive Control (MPC) and Reinforcement Learning (RL) are two prominent strategies for controlling legged robots, each with unique strengths. RL learns control policies through system interaction, adapting to various scenarios, whereas MPC relies on a predefined mathematical model to solve optimization problems in real-time. Despite their widespread use, there is a lack of direct comparative analysis under standardized conditions. This work addresses this gap by benchmarking MPC and RL controllers on a Unitree Go1 quadruped robot within the MuJoCo simulation environment, focusing on a standardized task: straight walking at a constant velocity. Performance is evaluated based on disturbance rejection, energy efficiency, and terrain adaptability. The results show that RL excels in handling disturbances and maintaining energy efficiency but struggles with generalization to new terrains due to its dependence on learned policies tailored to specific environments. In contrast, MPC shows enhanced recovery capabilities from larger perturbations by leveraging its optimization-based approach, allowing for a balanced distribution of control efforts across the robot&#39;s joints. The results provide a clear understanding of the advantages and limitations of both RL and MPC, offering insights into selecting an appropriate control strategy for legged robotic applications. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16590v1-abstract-full').style.display = 'none'; document.getElementById('2501.16590v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.16409">arXiv:2501.16409</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.16409">pdf</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> </div> </div> <p class="title is-5 mathjax"> Classification of Mild Cognitive Impairment Based on Dynamic Functional Connectivity Using Spatio-Temporal Transformer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+Y">Yanjun Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+X">Xiaowei Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+C">Chao Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Minheng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+Y">Yan Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+T">Tianming Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+D">Dajiang Zhu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.16409v1-abstract-short" style="display: inline;"> Dynamic functional connectivity (dFC) using resting-state functional magnetic resonance imaging (rs-fMRI) is an advanced technique for capturing the dynamic changes of neural activities, and can be very useful in the studies of brain diseases such as Alzheimer&#39;s disease (AD). Yet, existing studies have not fully leveraged the sequential information embedded within dFC that can potentially provide&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.16409v1-abstract-full').style.display = 'inline'; document.getElementById('2501.16409v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.16409v1-abstract-full" style="display: none;"> Dynamic functional connectivity (dFC) using resting-state functional magnetic resonance imaging (rs-fMRI) is an advanced technique for capturing the dynamic changes of neural activities, and can be very useful in the studies of brain diseases such as Alzheimer&#39;s disease (AD). Yet, existing studies have not fully leveraged the sequential information embedded within dFC that can potentially provide valuable information when identifying brain conditions. In this paper, we propose a novel framework that jointly learns the embedding of both spatial and temporal information within dFC based on the transformer architecture. Specifically, we first construct dFC networks from rs-fMRI data through a sliding window strategy. Then, we simultaneously employ a temporal block and a spatial block to capture higher-order representations of dynamic spatio-temporal dependencies, via mapping them into an efficient fused feature representation. 
arXiv:2501.16282 [pdf]
Subjects: eess.IV (Image and Video Processing); cs.AI (Artificial Intelligence); cs.CV (Computer Vision and Pattern Recognition)
Title: Brain-Adapter: Enhancing Neurological Disorder Analysis with Adapter-Tuning Multimodal Large Language Models
Authors: Jing Zhang, Xiaowei Yu, Yanjun Lyu, Lu Zhang, Tong Chen, Chao Cao, Yan Zhuang, Minheng Chen, Tianming Liu, Dajiang Zhu
Abstract: Understanding brain disorders is crucial for accurate clinical diagnosis and treatment. Recent advances in Multimodal Large Language Models (MLLMs) offer a promising approach to interpreting medical images with the support of text descriptions. However, previous research has primarily focused on 2D medical images, leaving the richer spatial information of 3D images under-explored, and single-modality-based methods are limited by overlooking the critical clinical information contained in other modalities. To address this issue, this paper proposes Brain-Adapter, a novel approach that incorporates an extra bottleneck layer to learn new knowledge and instill it into the original pre-trained knowledge. The major idea is to incorporate a lightweight bottleneck layer to train fewer parameters while capturing essential information, and to utilize a Contrastive Language-Image Pre-training (CLIP) strategy to align multimodal data within a unified representation space. Extensive experiments demonstrated the effectiveness of our approach in integrating multimodal data to significantly improve diagnosis accuracy without high computational costs, highlighting the potential to enhance real-world diagnostic workflows.
Submitted 27 January, 2025; originally announced January 2025.
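Aside (illustrative only): the adapter-tuning idea summarized above, a small bottleneck trained beside a frozen backbone, can be pictured with a minimal residual bottleneck module; the layer sizes below are arbitrary placeholders and this is not the Brain-Adapter implementation.

```python
# Minimal residual bottleneck-adapter sketch (illustrative; sizes are placeholders).
import torch
import torch.nn as nn

class BottleneckAdapter(nn.Module):
    """Down-project -> nonlinearity -> up-project, added back to the frozen feature."""
    def __init__(self, dim=768, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)
        self.act = nn.GELU()

    def forward(self, x):
        return x + self.up(self.act(self.down(x)))   # residual keeps pre-trained knowledge

features = torch.randn(4, 768)          # features from a frozen encoder (placeholder)
adapter = BottleneckAdapter()
print(adapter(features).shape)          # torch.Size([4, 768])
```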
arXiv:2501.15475 [pdf, other]
Subjects: cs.SE (Software Engineering)
Title: The Same Only Different: On Information Modality for Configuration Performance Analysis
Authors: Hongyuan Liang, Yue Huang, Tao Chen
Abstract: Configuration in software systems helps to ensure efficient operation and meet diverse user needs. Yet, some, if not all, configuration options have profound implications for the system's performance.
Configuration performance analysis, wherein the key is to understand (or infer) the configuration options' relations and their impacts on performance, is crucial. Two major modalities serve as the source information in the analysis: the manual and the source code. However, it remains unclear what roles they play in configuration performance analysis. Much work that relies on manuals claims their benefits of information richness and naturalness, while work that trusts the source code more prefers the structural information provided therein and criticizes the timeliness of manuals. To fill this gap, in this paper we conduct an extensive empirical study over 10 systems, covering 1,694 options, 106,798 words in the manuals, and 22,859,552 lines of code, to investigate the usefulness of the manual and the code in two important tasks of configuration performance analysis, namely performance-sensitive option identification and the associated dependency extraction. We reveal several new findings and insights, such as that it is beneficial to fuse the manual and code modalities for both tasks, and that current automated tools relying on a single modality are far from practically useful and generally remain incomparable to human analysis. All of these pave the way for further advancing configuration performance analysis.
Submitted 6 February, 2025; v1 submitted 26 January, 2025; originally announced January 2025.
Comments: accepted by ICSE 2025
arXiv:2501.15392 [pdf, other]
Subjects: cs.SE (Software Engineering); cs.AI (Artificial Intelligence)
Title: Faster Configuration Performance Bug Testing with Neural Dual-level Prioritization
Authors: Youpeng Ma, Tao Chen, Ke Li
Abstract: As software systems become more complex and configurable, more performance problems tend to arise from the configuration designs. This has caused some configuration options to unexpectedly degrade performance, deviating from the original expectations of the developers. Such discrepancies, namely configuration performance bugs (CPBugs), are devastating and can be deeply hidden in the source code. Yet, efficiently testing CPBugs is difficult, not only because the test oracle is hard to set, but also because configuration measurement is expensive and there are simply too many possible configurations to test. As such, existing testing tools suffer from lengthy runtimes or have been ineffective in detecting CPBugs when the budget is limited, compounded by inaccurate test oracles. In this paper, we seek to achieve significantly faster CPBug testing by neurally prioritizing the testing at both the configuration option and value range levels, with automated oracle estimation. Our proposed tool, dubbed NDP, is a general framework that works with different heuristic generators. The idea is to leverage two neural language models: one to estimate the CPBug types that serve as the oracle while, more vitally, the other to infer the probabilities of an option being CPBug-related, based on which the options and the value ranges to be searched can be prioritized. Experiments on several widely-used systems of different versions reveal that NDP can, in general, better predict CPBug types in 87% of cases and find more CPBugs with up to 88.88x testing efficiency speedup over the state-of-the-art tools.
Submitted 2 February, 2025; v1 submitted 25 January, 2025; originally announced January 2025.
Comments: accepted by ICSE 2025
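Aside (illustrative only, not the NDP implementation): the option-level prioritization described above can be caricatured as ranking configuration options by a model's predicted probability of being CPBug-related and spending a limited testing budget greedily; the option names and probabilities below are hypothetical.

```python
# Illustrative sketch: prioritize configuration options by a predicted probability of
# being CPBug-related, then spend a limited testing budget on the top-ranked ones.
# Option names and probabilities are hypothetical placeholders.
options = {
    "cache_size": 0.81,     # P(option is CPBug-related), e.g. from a language model
    "thread_pool": 0.64,
    "log_level": 0.07,
    "timeout_ms": 0.43,
}

def prioritize(option_probs, budget):
    """Return the options to test first, highest predicted relevance first."""
    ranked = sorted(option_probs, key=option_probs.get, reverse=True)
    return ranked[:budget]

print(prioritize(options, budget=2))   # ['cache_size', 'thread_pool']
```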
arXiv:2501.14147 [pdf, other]
Subjects: cs.RO (Robotics)
Title: HAMMER: Heterogeneous, Multi-Robot Semantic Gaussian Splatting
Authors: Javier Yu, Timothy Chen, Mac Schwager
Abstract: 3D Gaussian Splatting offers expressive scene reconstruction, modeling a broad range of visual, geometric, and semantic information. However, efficient real-time map reconstruction with data streamed from multiple robots and devices remains a challenge. To that end, we propose HAMMER, a server-based collaborative Gaussian Splatting method that leverages widely available ROS communication infrastructure to generate 3D, metric-semantic maps from asynchronous robot data-streams with no prior knowledge of initial robot positions and varying on-device pose estimators. HAMMER consists of (i) a frame alignment module that transforms local SLAM poses and image data into a global frame and requires no prior relative pose knowledge, and (ii) an online module for training semantic 3DGS maps from streaming data. HAMMER handles mixed perception modes, adjusts automatically for variations in image pre-processing among different devices, and distills CLIP semantic codes into the 3D scene for open-vocabulary language queries. In our real-world experiments, HAMMER creates higher-fidelity maps (2x) compared to competing baselines and is useful for downstream tasks, such as semantic goal-conditioned navigation (e.g., "go to the couch"). Accompanying content available at hammer-project.github.io.
Submitted 23 January, 2025; originally announced January 2025.
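Aside (a rough illustration, not the authors' implementation): the frame-alignment step described above can be pictured as composing each robot's local SE(3) poses with an estimated local-to-global transform; the 4x4 transforms below are synthetic placeholders.

```python
# Illustrative sketch: express local SLAM poses in a shared global frame by
# left-multiplying with an estimated local-to-global SE(3) transform.
# All transforms here are synthetic placeholders.
import numpy as np

def se3(R, t):
    T = np.eye(4)
    T[:3, :3] = R
    T[:3, 3] = t
    return T

def to_global(T_global_local, local_poses):
    """local_poses: list of 4x4 poses in a robot's local frame."""
    return [T_global_local @ T for T in local_poses]

theta = np.pi / 2
Rz = np.array([[np.cos(theta), -np.sin(theta), 0.0],
               [np.sin(theta),  np.cos(theta), 0.0],
               [0.0, 0.0, 1.0]])
T_gl = se3(Rz, np.array([1.0, 2.0, 0.0]))        # estimated alignment (placeholder)
pose_local = se3(np.eye(3), np.array([0.5, 0.0, 0.0]))
print(to_global(T_gl, [pose_local])[0][:3, 3])   # position expressed in the global frame
```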
arXiv:2501.13766 [pdf, other]
Subjects: cs.CL (Computation and Language); cs.AI (Artificial Intelligence)
Title: UGMathBench: A Diverse and Dynamic Benchmark for Undergraduate-Level Mathematical Reasoning with Large Language Models
Authors: Xin Xu, Jiaxin Zhang, Tianhao Chen, Zitong Chao, Jishan Hu, Can Yang
Abstract: Large Language Models (LLMs) have made significant strides in mathematical reasoning, underscoring the need for a comprehensive and fair evaluation of their capabilities. However, existing benchmarks often fall short, either lacking extensive coverage of undergraduate-level mathematical problems or probably suffering from test-set contamination.
To address these issues, we introduce UGMathBench, a diverse and dynamic benchmark specifically designed for evaluating undergraduate-level mathematical reasoning with LLMs. UGMathBench comprises 5,062 problems across 16 subjects and 111 topics, featuring 10 distinct answer types. Each problem includes three randomized versions, with additional versions planned for release as leading open-source LLMs become saturated in UGMathBench. Furthermore, we propose two key metrics: effective accuracy (EAcc), which measures the percentage of correctly solved problems across all three versions, and reasoning gap ($\Delta$), which assesses reasoning robustness by calculating the difference between the average accuracy across all versions and EAcc. Our extensive evaluation of 23 leading LLMs reveals that the highest EAcc achieved is 56.3% by OpenAI-o1-mini, with large $\Delta$ values observed across different models. This highlights the need for future research aimed at developing "large reasoning models" with high EAcc and $\Delta = 0$. We anticipate that the release of UGMathBench, along with its detailed evaluation codes, will serve as a valuable resource to advance the development of LLMs in solving mathematical problems.
Submitted 23 January, 2025; originally announced January 2025.
Comments: Accepted to ICLR 2025
Journal ref: International Conference on Learning Representations (ICLR 2025)
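Aside (illustrative computation on made-up data): the two metrics defined in this abstract are straightforward to compute, with EAcc counting problems solved in all three randomized versions and $\Delta$ being the average per-version accuracy minus EAcc.

```python
# Illustrative computation of UGMathBench-style metrics on made-up data:
# EAcc  = fraction of problems answered correctly in all three randomized versions,
# Delta = average per-version accuracy minus EAcc.
import numpy as np

# rows = problems, columns = the three randomized versions (1 = correct); placeholder data
correct = np.array([
    [1, 1, 1],
    [1, 0, 1],
    [0, 0, 0],
    [1, 1, 1],
])

eacc = np.mean(correct.all(axis=1))          # solved in every version
avg_acc = correct.mean()                     # average accuracy over all versions
delta = avg_acc - eacc

print(f"EAcc = {eacc:.3f}, average accuracy = {avg_acc:.3f}, Delta = {delta:.3f}")
# EAcc = 0.500, average accuracy = 0.667, Delta = 0.167
```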
arXiv:2501.12610 [pdf, other]
Subjects: cs.CY (Computers and Society); cs.IR (Information Retrieval)
Title: Exploring Wikipedia Gender Diversity Over Time – The Wikipedia Gender Dashboard (WGD)
Authors: Yahya Yunus, Tianwa Chen, Gianluca Demartini
Abstract: The Wikipedia editors' community has been actively pursuing the intent of achieving gender equality. To that end, it is important to explore the historical evolution of underlying gender disparities in Wikipedia articles. This paper presents the Wikipedia Gender Dashboard (WGD), a tool designed to enable interaction with gender distribution data, including the average age in every subclass of individuals (i.e. Astronauts, Politicians, etc.) over the years. Wikipedia APIs, DBpedia, and Wikidata endpoints were used to query the data to ensure persistent data collection. The WGD was then created with Microsoft Power BI before being embedded on a public website. The analysis of the data available in the WGD found that female articles only represent around 17% of English Wikipedia, but that share has been growing steadily over the last 20 years. Meanwhile, the average age across genders decreased over time. WGD also shows that most subclasses of 'Person' are male-dominated. Wikipedia editors can make use of WGD to locate areas with marginalized genders in Wikipedia, and increase their efforts to produce more content providing coverage for those genders, to achieve better gender equality in Wikipedia.
Submitted 21 January, 2025; originally announced January 2025.
Comments: 4 pages, 5 figures
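Aside (illustrative, not the authors' pipeline): the kind of Wikidata endpoint query behind a dashboard like this could group humans by gender roughly as sketched below; the SPARQL pattern is standard Wikidata usage but is an assumption here, and an unrestricted query over all humans may time out, so a real pipeline would narrow it (e.g. by subclass).

```python
# Illustrative sketch: count Wikidata humans (Q5) by gender (P21) via SPARQL.
# Not the authors' pipeline; narrow the query in practice to avoid endpoint timeouts.
import requests

SPARQL = """
SELECT ?genderLabel (COUNT(?person) AS ?count) WHERE {
  ?person wdt:P31 wd:Q5;
          wdt:P21 ?gender.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
GROUP BY ?genderLabel
"""

resp = requests.get(
    "https://query.wikidata.org/sparql",
    params={"query": SPARQL, "format": "json"},
    headers={"User-Agent": "wgd-example/0.1"},   # hypothetical client identifier
    timeout=60,
)
for row in resp.json()["results"]["bindings"]:
    print(row["genderLabel"]["value"], row["count"]["value"])
```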
arXiv:2501.12558 [pdf]
Subjects: cond-mat.mtrl-sci (Materials Science); cs.LG (Machine Learning)
Title: Structural and mechanical properties of W-Cu compounds characterized by a neural-network-based potential
Authors: Jianchuan Liu, Tao Chen, Sheng Mao, Mohan Chen
Abstract: Tungsten-copper (W-Cu) compounds are widely utilized in various industrial fields due to their exceptional mechanical properties. In this study, we have developed a neural-network-based deep potential (DP) model that covers a wide range of temperatures, from 0 to 3,000 K, and pressures, from 0 to 10 GPa. The model is trained using density functional theory data for full-concentration Cu$_x$W$_{100-x}$ compounds. Through this model, we systematically investigate the structural and mechanical properties of W-Cu alloys and have the following findings. First, the bulk modulus (B) and Young's modulus (E) of W-Cu alloys exhibit a linear decline as the Cu content increases, indicating a softening trend in the Cu$_x$W$_{100-x}$ compounds as the Cu concentration rises. Second, a higher Cu content results in higher critical strain and lower critical stress for these compounds. A brittle-to-ductile transition in the deformation mode is predicted at around 37.5 at.% Cu content. Third, tensile loading tests in the W-Cu gradient structure reveal that the Cu-poor region serves as a barrier, hindering shear band propagation while promoting new shear band formation in the Cu-rich region. The above results from the DP model are anticipated to aid in exploring the physical mechanisms underlying the complex phenomena of W-Cu systems and to contribute to the advancement of methodologies for materials simulation.
Submitted 24 January, 2025; v1 submitted 21 January, 2025; originally announced January 2025.
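Aside (illustrative only): since the abstract reports an approximately linear decline of the elastic moduli with Cu content, checking such a trend amounts to a simple linear fit; the data points below are entirely hypothetical, not values from the paper.

```python
# Illustrative sketch: fit a linear trend of Young's modulus vs. Cu content.
# The data points are hypothetical placeholders, not results from the paper.
import numpy as np

cu_at_percent = np.array([0, 25, 50, 75, 100])      # at.% Cu (placeholder)
youngs_gpa = np.array([400, 330, 255, 185, 115])    # Young's modulus in GPa (placeholder)

slope, intercept = np.polyfit(cu_at_percent, youngs_gpa, deg=1)
print(f"E ~ {intercept:.1f} {slope:+.2f} * x_Cu  (GPa)")
```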
arXiv:2501.12461 [pdf, other]
Subjects: cs.SE (Software Engineering)
Title: Empowering AIOps: Leveraging Large Language Models for IT Operations Management
Authors: Arthur Vitui, Tse-Hsun Chen
Abstract: The integration of Artificial Intelligence (AI) into IT Operations Management (ITOM), commonly referred to as AIOps, offers substantial potential for automating workflows, enhancing efficiency, and supporting informed decision-making.
However, implementing AI within IT operations is not without its challenges, including issues related to data quality, the complexity of IT environments, and skill gaps within teams. The advent of Large Language Models (LLMs) presents an opportunity to address some of these challenges, particularly through their advanced natural language understanding capabilities. These features enable organizations to process and analyze vast amounts of unstructured data, such as system logs, incident reports, and technical documentation. This ability aligns with the motivation behind our research, where we aim to integrate traditional predictive machine learning models with generative AI technologies like LLMs. By combining these approaches, we propose innovative methods to tackle persistent challenges in AIOps and enhance the capabilities of IT operations management.
Submitted 23 January, 2025; v1 submitted 21 January, 2025; originally announced January 2025.

arXiv:2501.12053 [pdf, other]
Subjects: cs.CE (Computational Engineering, Finance, and Science)
Title: PINNsAgent: Automated PDE Surrogation with Large Language Models
Authors: Qingpo Wuwu, Chonghan Gao, Tianyu Chen, Yihang Huang, Yuekai Zhang, Jianing Wang, Jianxin Li, Haoyi Zhou, Shanghang Zhang
Abstract: Solving partial differential equations (PDEs) using neural methods has been a long-standing scientific and engineering research pursuit. Physics-Informed Neural Networks (PINNs) have emerged as a promising alternative to traditional numerical methods for solving PDEs. However, the gap between domain-specific knowledge and deep learning expertise often limits the practical application of PINNs.
Previous works typically involve manually conducting extensive PINNs experiments and summarizing heuristic rules for hyperparameter tuning. In this work, we introduce PINNsAgent, a novel surrogation framework that leverages large language models (LLMs) and utilizes PINNs as a foundation to bridge the gap between domain-specific knowledge and deep learning. Specifically, PINNsAgent integrates (1) Physics-Guided Knowledge Replay (PGKR), which encodes the essential characteristics of PDEs and their associated best-performing PINNs configurations into a structured format, enabling efficient knowledge transfer from solved PDEs to similar problems, and (2) Memory Tree Reasoning, a strategy that effectively explores the search space for optimal PINNs architectures. By leveraging LLMs and exploration strategies, PINNsAgent enhances the automation and efficiency of PINNs-based solutions. We evaluate PINNsAgent on 14 benchmark PDEs, demonstrating its effectiveness in automating the surrogation process and significantly improving the accuracy of PINNs-based solutions.
Submitted 21 January, 2025; originally announced January 2025.
Comments: 9 pages, 3 figures, 3 tables
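Aside (a generic, hedged illustration of the PINNs foundation this abstract builds on, not PINNsAgent itself): a physics-informed loss penalizes the PDE residual at collocation points via automatic differentiation; the network size, toy 1D Poisson problem, and sampling below are arbitrary, and boundary terms are omitted.

```python
# Generic PINN-style residual loss for a toy 1D Poisson problem u''(x) = f(x).
# Purely illustrative: not PINNsAgent; architecture, PDE, and sampling are arbitrary.
import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(1, 32), nn.Tanh(), nn.Linear(32, 1))

def pde_residual(x):
    x = x.requires_grad_(True)
    u = net(x)
    du = torch.autograd.grad(u, x, torch.ones_like(u), create_graph=True)[0]
    d2u = torch.autograd.grad(du, x, torch.ones_like(du), create_graph=True)[0]
    f = torch.sin(x)                       # arbitrary source term
    return d2u - f

x_collocation = torch.rand(64, 1)          # interior collocation points
loss = pde_residual(x_collocation).pow(2).mean()
loss.backward()                            # gradients for one training step
print(float(loss))
```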
arXiv:2501.09555 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)
Title: Text-driven Adaptation of Foundation Models for Few-shot Surgical Workflow Analysis
Authors: Tingxuan Chen, Kun Yuan, Vinkle Srivastav, Nassir Navab, Nicolas Padoy
Abstract: Purpose: Surgical workflow analysis is crucial for improving surgical efficiency and safety. However, previous studies rely heavily on large-scale annotated datasets, posing challenges in cost, scalability, and reliance on expert annotations. To address this, we propose Surg-FTDA (Few-shot Text-driven Adaptation), designed to handle various surgical workflow analysis tasks with minimal paired image-label data. Methods: Our approach has two key components. First, few-shot selection-based modality alignment selects a small subset of images and aligns their embeddings with text embeddings from the downstream task, bridging the modality gap. Second, text-driven adaptation leverages only text data to train a decoder, eliminating the need for paired image-text data. This decoder is then applied to aligned image embeddings, enabling image-related tasks without explicit image-text pairs. Results: We evaluate our approach on generative tasks (image captioning) and discriminative tasks (triplet recognition and phase recognition). Results show that Surg-FTDA outperforms baselines and generalizes well across downstream tasks. Conclusion: We propose a text-driven adaptation approach that mitigates the modality gap and handles multiple downstream tasks in surgical workflow analysis, with minimal reliance on large annotated datasets. The code and dataset will be released at https://github.com/CAMMA-public/Surg-FTDA
Submitted 27 January, 2025; v1 submitted 16 January, 2025; originally announced January 2025.
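Aside (a rough sketch, not the authors' code): one way to picture the few-shot selection step described above is to pick, for each task-specific text embedding, the handful of image embeddings closest in cosine similarity; the embeddings below are random stand-ins rather than CLIP outputs.

```python
# Illustrative sketch: select a small set of images whose embeddings are closest
# (cosine similarity) to the downstream task's text embeddings.
# Embeddings are random stand-ins, not outputs from the paper's models.
import numpy as np

rng = np.random.default_rng(0)
image_emb = rng.normal(size=(500, 512))   # candidate image embeddings (placeholder)
text_emb = rng.normal(size=(10, 512))     # task text embeddings (placeholder)

def l2norm(x):
    return x / np.linalg.norm(x, axis=1, keepdims=True)

sims = l2norm(text_emb) @ l2norm(image_emb).T          # (10, 500) cosine similarities
few_shot_idx = np.argsort(-sims, axis=1)[:, :5]        # top-5 images per text prompt
print(few_shot_idx.shape)                              # (10, 5)
```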
arXiv:2501.07468 [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
Title: From Screens to Scenes: A Survey of Embodied AI in Healthcare
Authors: Yihao Liu, Xu Cao, Tingting Chen, Yankai Jiang, Junjie You, Minghua Wu, Xiaosong Wang, Mengling Feng, Yaochu Jin, Jintai Chen
Abstract: Healthcare systems worldwide face persistent challenges in efficiency, accessibility, and personalization.
Powered by modern AI technologies such as multimodal large language models and world models, Embodied AI (EmAI) represents a transformative frontier, offering enhanced autonomy and the ability to interact with the physical world to address these challenges. As an interdisciplinary and rapidly evolving research domain, "EmAI in healthcare" spans diverse fields such as algorithms, robotics, and biomedicine. This complexity underscores the importance of timely reviews and analyses to track advancements, address challenges, and foster cross-disciplinary collaboration. In this paper, we provide a comprehensive overview of the "brain" of EmAI for healthcare, wherein we introduce foundational AI algorithms for perception, actuation, planning, and memory, and focus on presenting the healthcare applications spanning clinical interventions, daily care & companionship, infrastructure support, and biomedical research. Despite its promise, the development of EmAI for healthcare is hindered by critical challenges such as safety concerns, gaps between simulation platforms and real-world applications, the absence of standardized benchmarks, and uneven progress across interdisciplinary domains. We discuss the technical barriers and explore ethical considerations, offering a forward-looking perspective on the future of EmAI in healthcare. A hierarchical framework of intelligent levels for EmAI systems is also introduced to guide further development. By providing systematic insights, this work aims to inspire innovation and practical applications, paving the way for a new era of intelligent, patient-centered healthcare.
Submitted 24 January, 2025; v1 submitted 13 January, 2025; originally announced January 2025.
Comments: 58 pages, 11 figures

arXiv:2501.07400 [pdf, ps, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); math.AP (Analysis of PDEs); math.OC (Optimization and Control); stat.ML (Machine Learning)
Title: Derivation of effective gradient flow equations and dynamical truncation of training data in Deep Learning
Authors: Thomas Chen
Abstract: We derive explicit equations governing the cumulative biases and weights in Deep Learning with ReLU activation function, based on gradient descent for the Euclidean cost in the input layer, and under the assumption that the weights are, in a precise sense, adapted to the coordinate system distinguished by the activations. We show that gradient descent corresponds to a dynamical process in the input layer, whereby clusters of data are progressively reduced in complexity ("truncated") at an exponential rate that increases with the number of data points that have already been truncated. We provide a detailed discussion of several types of solutions to the gradient flow equations. A main motivation for this work is to shed light on the interpretability question in supervised learning.
Submitted 13 January, 2025; originally announced January 2025.
Comments: AMS Latex, 35 pages
MSC Class: 57R70; 62M45

arXiv:2501.07236 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: CSTA: Spatial-Temporal Causal Adaptive Learning for Exemplar-Free Video Class-Incremental Learning
Authors: Tieyuan Chen, Huabin Liu, Chern Hong Lim, John See, Xing Gao, Junhui Hou, Weiyao Lin
Abstract: Continual learning aims to acquire new knowledge while retaining past information. Class-incremental learning (CIL) presents a challenging scenario where classes are introduced sequentially. For video data, the task becomes more complex than for image data because it requires learning and preserving both spatial appearance and temporal action involvement. To address this challenge, we propose a novel exemplar-free framework that equips separate spatiotemporal adapters to learn new class patterns, accommodating the incremental information representation requirements unique to each class.
While separate adapters are proven to mitigate forgetting and fit unique requirements, naively applying them hinders the intrinsic connection between spatial and temporal information increments, affecting the efficiency of representing newly learned class information. Motivated by this, we introduce two key innovations from a causal perspective. First, a causal distillation module is devised to maintain the relation between spatial-temporal knowledge for a more efficient representation. Second, a causal compensation mechanism is proposed to reduce the conflicts during increment and memorization between different types of information. Extensive experiments conducted on benchmark datasets demonstrate that our framework can achieve new state-of-the-art results, surpassing current example-based methods by 4.2% in accuracy on average.
Submitted 13 January, 2025; originally announced January 2025.
Comments: IEEE TCSVT Submission

arXiv:2501.07227 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
Title: MECD+: Unlocking Event-Level Causal Graph Discovery for Video Reasoning
Authors: Tieyuan Chen, Huabin Liu, Yi Wang, Yihang Chen, Tianyao He, Chaofan Gan, Huanyu He, Weiyao Lin
Abstract: Video causal reasoning aims to achieve a high-level understanding of videos from a causal perspective. However, it exhibits limitations in its scope, primarily executed in a question-answering paradigm and focusing on brief video segments containing isolated events and basic causal relations, lacking comprehensive and structured causality analysis for videos with multiple interconnected events.
To fill this gap, we introduce a new task and dataset, Multi-Event Causal Discovery (MECD). It aims to uncover the causal relations between events distributed chronologically across long videos. Given visual segments and textual descriptions of events, MECD identifies the causal associations between these events to derive a comprehensive and structured event-level video causal graph explaining why and how the result event occurred. To address the challenges of MECD, we devise a novel framework inspired by the Granger Causality method, incorporating an efficient mask-based event prediction model to perform an Event Granger Test. It estimates causality by comparing the predicted result event when premise events are masked versus unmasked. Furthermore, we integrate causal inference techniques such as front-door adjustment and counterfactual inference to mitigate challenges in MECD like causality confounding and illusory causality. Additionally, context chain reasoning is introduced to conduct more robust and generalized reasoning. Experiments validate the effectiveness of our framework in reasoning complete causal relations, outperforming GPT-4o and VideoChat2 by 5.77% and 2.70%, respectively. Further experiments demonstrate that causal relation graphs can also contribute to downstream video understanding tasks such as video question answering and video event prediction.
Submitted 16 January, 2025; v1 submitted 13 January, 2025; originally announced January 2025.
Comments: IEEE TPAMI Submission. Continuation of arXiv:2409.17647 (NeurIPS 2024)
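Aside (a caricature, not the paper's model): the Event Granger Test idea summarized above can be pictured as scoring a premise event by how much masking it degrades prediction of the result event; the "predictor" and features below are synthetic stand-ins.

```python
# Caricature of an event-level Granger-style test: a premise event is scored by how
# much masking it increases the error of predicting the result event.
# The predictor and features are synthetic stand-ins, not the paper's model.
import numpy as np

rng = np.random.default_rng(0)
premise_feats = rng.normal(size=(3, 8))          # three premise events, 8-dim features
weights = rng.normal(size=(3, 8))
result_feat = (weights * premise_feats).sum()    # toy "ground-truth" result signal

def predict(feats, mask):
    return (weights * feats * mask[:, None]).sum()   # toy predictor with an event mask

def causality_score(i):
    full = np.ones(3)
    masked = full.copy()
    masked[i] = 0.0
    err_unmasked = abs(predict(premise_feats, full) - result_feat)
    err_masked = abs(predict(premise_feats, masked) - result_feat)
    return err_masked - err_unmasked                 # larger gap => stronger causal link

print([round(causality_score(i), 3) for i in range(3)])
```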
arXiv:2501.06468 [pdf, other] cs.CL cs.AI
First Token Probability Guided RAG for Telecom Question Answering
Authors: Tingwei Chen, Jiayi Chen, Zijian Zhao, Haolong Chen, Liang Zhang, Guangxu Zhu
Abstract: Large Language Models (LLMs) have garnered significant attention for their impressive general-purpose capabilities. For applications requiring intricate domain knowledge, Retrieval-Augmented Generation (RAG) has shown a distinct advantage in incorporating domain-specific information into LLMs. However, existing RAG research has not fully addressed the challenges of Multiple Choice Question Answering (MCQA) in telecommunications, particularly in terms of retrieval quality and mitigating hallucinations. To tackle these challenges, we propose a novel first token probability guided RAG framework. This framework leverages confidence scores to optimize key hyperparameters, such as chunk number and chunk window size, while dynamically adjusting the context. Our method starts by retrieving the most relevant chunks and generates a single token as the potential answer. The probabilities of all options are then normalized to serve as confidence scores, which guide the dynamic adjustment of the context. By iteratively optimizing the hyperparameters based on these confidence scores, we can continuously improve RAG performance. We conducted experiments to validate the effectiveness of our framework, demonstrating its potential to enhance accuracy in domain-specific MCQA tasks.
Submitted 11 January, 2025; originally announced January 2025.
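A minimal sketch of the first-token confidence idea described above: the log-probabilities the model assigns to the option tokens are softmax-normalized into confidence scores, and retrieval is retried with a larger context until the top option is confident enough. The get_option_logprobs callable, the chunk counts, and the 0.6 threshold are hypothetical placeholders rather than the paper's actual hyperparameter search.

```python
import math
from typing import Callable, Mapping, Sequence

def option_confidences(first_token_logprobs: Mapping[str, float]) -> dict:
    """Softmax-normalize first-token log-probabilities of the options (A/B/C/D)
    into a probability distribution that serves as the confidence scores."""
    m = max(first_token_logprobs.values())
    exps = {k: math.exp(v - m) for k, v in first_token_logprobs.items()}
    z = sum(exps.values())
    return {k: v / z for k, v in exps.items()}

def answer_with_adaptive_context(
    get_option_logprobs: Callable[[str, int], Mapping[str, float]],
    question: str,
    chunk_counts: Sequence[int] = (2, 4, 8),
    min_confidence: float = 0.6,
):
    """Enlarge the retrieved context until the top option is confident enough,
    otherwise fall back to the most confident attempt seen."""
    best = ("", 0.0)
    for n_chunks in chunk_counts:
        conf = option_confidences(get_option_logprobs(question, n_chunks))
        option, p = max(conf.items(), key=lambda kv: kv[1])
        if p >= min_confidence:
            return option, p
        best = max(best, (option, p), key=lambda t: t[1])
    return best

# Toy stand-in for an LLM plus retriever: more chunks sharpen the distribution toward "B".
toy_logprobs = lambda q, n: {"A": -2.0, "B": -2.0 + 0.5 * n, "C": -3.0, "D": -3.5}
print(answer_with_adaptive_context(toy_logprobs, "Which band does NR n78 use?"))
```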
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06468v1-abstract-full').style.display = 'none'; document.getElementById('2501.06468v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06459">arXiv:2501.06459</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.06459">pdf</a>, <a href="https://arxiv.org/format/2501.06459">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Enhancing The Open Network: Definition and Automated Detection of Smart Contract Defects </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Song%2C+H">Hao Song</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+T">Teng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jiachi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Ting Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+B">Beibei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Z">Zhangyan Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+Y">Yi Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+P">Pan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+X">Xihan Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06459v1-abstract-short" style="display: inline;"> The Open Network (TON), designed to support Telegram&#39;s extensive user base of hundreds of millions, has garnered considerable attention since its launch in 2022. FunC is the most popular programming language for writing smart contracts on TON. It is distinguished by a unique syntax compared to other smart contract languages. Despite growing interest, research on the practical defects of TON smart&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06459v1-abstract-full').style.display = 'inline'; document.getElementById('2501.06459v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.06459v1-abstract-full" style="display: none;"> The Open Network (TON), designed to support Telegram&#39;s extensive user base of hundreds of millions, has garnered considerable attention since its launch in 2022. FunC is the most popular programming language for writing smart contracts on TON. It is distinguished by a unique syntax compared to other smart contract languages. Despite growing interest, research on the practical defects of TON smart contracts is still in its early stages. In this paper, we summarize eight smart contract defects identified from TON&#39;s official blogs and audit reports, each with detailed definitions and code examples. Furthermore, we propose a static analysis framework called TONScanner to facilitate the detection of these defects. 
Specifically, TONScanner reuses FunC compiler&#39;s frontend code to transform the FunC source code into FunC intermediate representation (IR) in the form of a directed acyclic graph (DAG). Based on this IR, TONScanner constructs a control flow graph (CFG), then transforms it into a static single assignment (SSA) form to simplify further analysis. TONScanner also integrates Data Dependency, Call Graph, Taint Analysis, and Cell Construct, which are specifically tailored for TON blockchain&#39;s unique data structures. These components finally facilitate the identification of the eight defects. We evaluate the effectiveness of TONScanner by applying it to 1,640 smart contracts and find a total of 14,995 defects. Through random sampling and manual labeling, we find that TONScanner achieves an overall precision of 97.49%. The results reveal that current TON contracts contain numerous defects, indicating that developers are prone to making errors. TONScanner has proven its ability to accurately identify these defects, thereby aiding in their correction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.06459v1-abstract-full').style.display = 'none'; document.getElementById('2501.06459v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The paper has been accepted for presentation at the 47th IEEE/ACM International Conference on Software Engineering (ICSE 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.06187">arXiv:2501.06187</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.06187">pdf</a>, <a href="https://arxiv.org/format/2501.06187">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-subject Open-set Personalization in Video Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tsai-Shien Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Siarohin%2C+A">Aliaksandr Siarohin</a>, <a href="/search/cs?searchtype=author&amp;query=Menapace%2C+W">Willi Menapace</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+Y">Yuwei Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+K+S">Kwot Sin Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Skorokhodov%2C+I">Ivan Skorokhodov</a>, <a href="/search/cs?searchtype=author&amp;query=Aberman%2C+K">Kfir Aberman</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+J">Jun-Yan Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+M">Ming-Hsuan Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Tulyakov%2C+S">Sergey Tulyakov</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.06187v1-abstract-short" style="display: inline;"> Video personalization methods allow us to synthesize videos with 
arXiv:2501.06187 [pdf, other] cs.CV
Multi-subject Open-set Personalization in Video Generation
Authors: Tsai-Shien Chen, Aliaksandr Siarohin, Willi Menapace, Yuwei Fang, Kwot Sin Lee, Ivan Skorokhodov, Kfir Aberman, Jun-Yan Zhu, Ming-Hsuan Yang, Sergey Tulyakov
Abstract: Video personalization methods allow us to synthesize videos with specific concepts such as people, pets, and places. However, existing methods often focus on limited domains, require time-consuming optimization per subject, or support only a single subject. We present Video Alchemist, a video model with built-in multi-subject, open-set personalization capabilities for both foreground objects and background, eliminating the need for time-consuming test-time optimization. Our model is built on a new Diffusion Transformer module that fuses each conditional reference image and its corresponding subject-level text prompt with cross-attention layers. Developing such a large model presents two main challenges: dataset and evaluation. First, as paired datasets of reference images and videos are extremely hard to collect, we sample selected video frames as reference images and synthesize a clip of the target video. However, while models can easily denoise training videos given reference frames, they fail to generalize to new contexts. To mitigate this issue, we design a new automatic data construction pipeline with extensive image augmentations. Second, evaluating open-set video personalization is a challenge in itself. To address this, we introduce a personalization benchmark that focuses on accurate subject fidelity and supports diverse personalization scenarios. Finally, our extensive experiments show that our method significantly outperforms existing personalization methods in both quantitative and qualitative evaluations.
Submitted 10 January, 2025; originally announced January 2025.
Comments: Project page: https://snap-research.github.io/open-set-video-personalization/
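A rough sketch of the kind of cross-attention fusion the abstract describes, where video latent tokens attend over the concatenation of a reference-image embedding and its subject-level prompt embedding. Token counts, dimensions, and the single-layer structure are illustrative assumptions, not the Video Alchemist architecture.

```python
import torch
import torch.nn as nn

class SubjectCrossAttention(nn.Module):
    """Video latent tokens cross-attend to the concatenation of one reference-image
    embedding and its subject-level text-prompt embedding, with a residual connection."""
    def __init__(self, dim: int = 256, heads: int = 4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, video_tokens, image_tokens, text_tokens):
        cond = torch.cat([image_tokens, text_tokens], dim=1)  # per-subject condition tokens
        fused, _ = self.attn(query=video_tokens, key=cond, value=cond)
        return self.norm(video_tokens + fused)

video = torch.randn(2, 64, 256)   # (batch, video latent tokens, dim)
image = torch.randn(2, 16, 256)   # tokens of one reference image
text = torch.randn(2, 8, 256)     # tokens of its subject-level prompt
print(SubjectCrossAttention()(video, image, text).shape)  # torch.Size([2, 64, 256])
```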
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project page: https://snap-research.github.io/open-set-video-personalization/</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.05793">arXiv:2501.05793</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.05793">pdf</a>, <a href="https://arxiv.org/format/2501.05793">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> ActMiner: Applying Causality Tracking and Increment Aligning for Graph-based Cyber Threat Hunting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Ma%2C+M">Mingjun Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+T">Tiantian Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tieming Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+S">Shuang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Ying%2C+J">Jie Ying</a>, <a href="/search/cs?searchtype=author&amp;query=Xiong%2C+C">Chunlin Xiong</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+M">Mingqi Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yan Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.05793v1-abstract-short" style="display: inline;"> To defend against Advanced Persistent Threats on the endpoint, threat hunting employs security knowledge such as cyber threat intelligence to continuously analyze system audit logs through retrospective scanning, querying, or pattern matching, aiming to uncover attack patterns/graphs that traditional detection methods (e.g., recognition for Point of Interest) fail to capture. However, existing thr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.05793v1-abstract-full').style.display = 'inline'; document.getElementById('2501.05793v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.05793v1-abstract-full" style="display: none;"> To defend against Advanced Persistent Threats on the endpoint, threat hunting employs security knowledge such as cyber threat intelligence to continuously analyze system audit logs through retrospective scanning, querying, or pattern matching, aiming to uncover attack patterns/graphs that traditional detection methods (e.g., recognition for Point of Interest) fail to capture. However, existing threat hunting systems based on provenance graphs face challenges of high false negatives, high false positives, and low efficiency when confronted with diverse attack tactics and voluminous audit logs. To address these issues, we propose a system called Actminer, which constructs query graphs from descriptive relationships in cyber threat intelligence reports for precise threat hunting (i.e., graph alignment) on provenance graphs. First, we present a heuristic search strategy based on equivalent semantic transfer to reduce false negatives. Second, we establish a filtering mechanism based on causal relationships of attack behaviors to mitigate false positives. 
arXiv:2501.05176 [pdf] cs.SE
Deep Assessment of Code Review Generation Approaches: Beyond Lexical Similarity
Authors: Yanjie Jiang, Hui Liu, Tianyi Chen, Fu Fan, Chunhao Dong, Kui Liu, Lu Zhang
Abstract: Code review is a standard practice for ensuring the quality of software projects, and recent research has focused extensively on automated code review. While significant advancements have been made in generating code reviews, the automated assessment of these reviews remains less explored, with existing approaches and metrics often proving inaccurate. Current metrics, such as BLEU, primarily rely on lexical similarity between generated and reference reviews. However, such metrics tend to underestimate reviews that articulate the expected issues in ways different from the references. In this paper, we explore how semantic similarity between generated and reference reviews can enhance the automated assessment of code reviews. We first present a benchmark called GradedReviews, which is constructed by collecting real-world code reviews from open-source projects, generating reviews using state-of-the-art approaches, and manually assessing their quality. We then evaluate existing metrics for code review assessment using this benchmark, revealing their limitations. To address these limitations, we propose two novel semantic-based approaches for assessing code reviews. The first approach involves converting both the generated review and its reference into digital vectors using a deep learning model and then measuring their semantic similarity through Cosine similarity. The second approach generates a prompt based on the generated review and its reference, submits this prompt to ChatGPT, and requests ChatGPT to rate the generated review according to explicitly defined criteria. Our evaluation on the GradedReviews benchmark indicates that the proposed semantic-based approaches significantly outperform existing state-of-the-art metrics in assessing generated code review, improving the correlation coefficient between the resulting scores and human scores from 0.22 to 0.47.
Submitted 9 January, 2025; originally announced January 2025.
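The first semantic approach reduces to encoding both reviews and comparing the vectors with cosine similarity; a minimal sketch is below. The toy vectors stand in for embeddings from any sentence encoder (the commented sentence-transformers call is one possible choice, not necessarily the model used in the paper).

```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two review embeddings."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# In practice the vectors would come from a sentence encoder, for example (assumption):
#   from sentence_transformers import SentenceTransformer
#   model = SentenceTransformer("all-MiniLM-L6-v2")
#   gen_vec, ref_vec = model.encode([generated_review, reference_review])
gen_vec = np.array([0.2, 0.7, 0.1])    # toy embedding of the generated review
ref_vec = np.array([0.25, 0.6, 0.05])  # toy embedding of the reference review
print(round(cosine_similarity(gen_vec, ref_vec), 3))
```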
arXiv:2501.04958 [pdf, other] cs.CV cs.AI
Addressing Domain Shift via Imbalance-Aware Domain Adaptation in Embryo Development Assessment
Authors: Lei Li, Xinglin Zhang, Jun Liang, Tao Chen
Abstract: Deep learning models in medical imaging face dual challenges: domain shift, where models perform poorly when deployed in settings different from their training environment, and class imbalance, where certain disease conditions are naturally underrepresented. We present Imbalance-Aware Domain Adaptation (IADA), a novel framework that simultaneously tackles both challenges through three key components: (1) adaptive feature learning with class-specific attention mechanisms, (2) balanced domain alignment with dynamic weighting, and (3) adaptive threshold optimization. Our theoretical analysis establishes convergence guarantees and complexity bounds. Through extensive experiments on embryo development assessment across four imaging modalities, IADA demonstrates significant improvements over existing methods, achieving up to 25.19% higher accuracy while maintaining balanced performance across classes. In challenging scenarios with low-quality imaging systems, IADA shows robust generalization with AUC improvements of up to 12.56%. These results demonstrate IADA's potential for developing reliable and equitable medical imaging systems for diverse clinical settings. The code is made publicly available at https://github.com/yinghemedical/imbalance-aware_domain_adaptation
Submitted 8 January, 2025; originally announced January 2025.
Comments: 15 pages
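One plausible reading of "balanced domain alignment with dynamic weighting" is to weight source samples by inverse class frequency before aligning source and target feature statistics; the sketch below illustrates that idea only and is not IADA's actual loss.

```python
import torch

def inverse_frequency_weights(labels: torch.Tensor, num_classes: int) -> torch.Tensor:
    """Per-sample weights inversely proportional to class frequency
    (an illustrative assumption; the paper's weighting scheme may differ)."""
    counts = torch.bincount(labels, minlength=num_classes).clamp(min=1).float()
    class_w = counts.sum() / (num_classes * counts)  # rare classes get weight > 1
    return class_w[labels]

def weighted_alignment_loss(src_feat, src_labels, tgt_feat, num_classes):
    """Class-weighted mean-feature discrepancy between source and target domains."""
    w = inverse_frequency_weights(src_labels, num_classes).unsqueeze(1)
    src_mean = (w * src_feat).sum(0) / w.sum()
    tgt_mean = tgt_feat.mean(0)
    return torch.norm(src_mean - tgt_mean, p=2) ** 2

src = torch.randn(32, 128)                 # source-domain features
labels = torch.randint(0, 4, (32,))        # imbalanced source labels
tgt = torch.randn(24, 128)                 # target-domain features
print(weighted_alignment_loss(src, labels, tgt, num_classes=4))
```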
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04359">arXiv:2501.04359</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.04359">pdf</a>, <a href="https://arxiv.org/format/2501.04359">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Decoding EEG Speech Perception with Transformers and VAE-based Data Augmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T+Y">Terrance Yu-Hao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+Y">Yulin Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Soederhaell%2C+P">Pontus Soederhaell</a>, <a href="/search/cs?searchtype=author&amp;query=Agrawal%2C+S">Sadrishya Agrawal</a>, <a href="/search/cs?searchtype=author&amp;query=Shapovalenko%2C+K">Kateryna Shapovalenko</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04359v1-abstract-short" style="display: inline;"> Decoding speech from non-invasive brain signals, such as electroencephalography (EEG), has the potential to advance brain-computer interfaces (BCIs), with applications in silent communication and assistive technologies for individuals with speech impairments. However, EEG-based speech decoding faces major challenges, such as noisy data, limited datasets, and poor performance on complex tasks like&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04359v1-abstract-full').style.display = 'inline'; document.getElementById('2501.04359v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.04359v1-abstract-full" style="display: none;"> Decoding speech from non-invasive brain signals, such as electroencephalography (EEG), has the potential to advance brain-computer interfaces (BCIs), with applications in silent communication and assistive technologies for individuals with speech impairments. However, EEG-based speech decoding faces major challenges, such as noisy data, limited datasets, and poor performance on complex tasks like speech perception. This study attempts to address these challenges by employing variational autoencoders (VAEs) for EEG data augmentation to improve data quality and applying a state-of-the-art (SOTA) sequence-to-sequence deep learning architecture, originally successful in electromyography (EMG) tasks, to EEG-based speech decoding. Additionally, we adapt this architecture for word classification tasks. 
Using the Brennan dataset, which contains EEG recordings of subjects listening to narrated speech, we preprocess the data and evaluate both classification and sequence-to-sequence models for EEG-to-words/sentences tasks. Our experiments show that VAEs have the potential to reconstruct artificial EEG data for augmentation. Meanwhile, our sequence-to-sequence model achieves more promising performance in generating sentences compared to our classification model, though both remain challenging tasks. These findings lay the groundwork for future research on EEG speech perception decoding, with possible extensions to speech production tasks such as silent or imagined speech. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.04359v1-abstract-full').style.display = 'none'; document.getElementById('2501.04359v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 15 figures, 2 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07; 92C55 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> H.5.2; I.2.6; J.3 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.04167">arXiv:2501.04167</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.04167">pdf</a>, <a href="https://arxiv.org/format/2501.04167">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Reasoning-Enhanced Self-Training for Long-Form Personalized Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Salemi%2C+A">Alireza Salemi</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+C">Cheng Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+M">Mingyang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Mei%2C+Q">Qiaozhu Mei</a>, <a href="/search/cs?searchtype=author&amp;query=Kong%2C+W">Weize Kong</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Tao Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhuowan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Bendersky%2C+M">Michael Bendersky</a>, <a href="/search/cs?searchtype=author&amp;query=Zamani%2C+H">Hamed Zamani</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.04167v1-abstract-short" style="display: inline;"> Personalized text generation requires a unique ability of large language models (LLMs) to learn from context that they often do not encounter during their standard training. 
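VAE-based augmentation of EEG windows can be sketched as follows: train a small VAE on flattened windows, then sample the decoder to obtain artificial examples. Channel count, window length, and layer sizes are illustrative assumptions, not the preprocessing used on the Brennan data.

```python
import torch
import torch.nn as nn

class EEGVAE(nn.Module):
    """Minimal VAE over flattened EEG windows; all sizes are illustrative only."""
    def __init__(self, in_dim: int = 61 * 100, latent: int = 32):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(in_dim, 256), nn.ReLU())
        self.mu, self.logvar = nn.Linear(256, latent), nn.Linear(256, latent)
        self.dec = nn.Sequential(nn.Linear(latent, 256), nn.ReLU(), nn.Linear(256, in_dim))

    def forward(self, x):
        h = self.enc(x)
        mu, logvar = self.mu(h), self.logvar(h)
        z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)  # reparameterization trick
        return self.dec(z), mu, logvar

def vae_loss(recon, x, mu, logvar):
    rec = nn.functional.mse_loss(recon, x, reduction="sum")
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return rec + kld

# After training, synthetic EEG windows for augmentation are sampled from the prior:
model = EEGVAE()
synthetic = model.dec(torch.randn(8, 32))  # 8 artificial windows
print(synthetic.shape)
```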
arXiv:2501.04167 [pdf, other] cs.CL cs.AI cs.IR
Reasoning-Enhanced Self-Training for Long-Form Personalized Text Generation
Authors: Alireza Salemi, Cheng Li, Mingyang Zhang, Qiaozhu Mei, Weize Kong, Tao Chen, Zhuowan Li, Michael Bendersky, Hamed Zamani
Abstract: Personalized text generation requires a unique ability of large language models (LLMs) to learn from context that they often do not encounter during their standard training. One way to encourage LLMs to better use personalized context for generating outputs that better align with the user's expectations is to instruct them to reason over the user's past preferences, background knowledge, or writing style. To achieve this, we propose Reasoning-Enhanced Self-Training for Personalized Text Generation (REST-PG), a framework that trains LLMs to reason over personal data during response generation. REST-PG first generates reasoning paths to train the LLM's reasoning abilities and then employs Expectation-Maximization Reinforced Self-Training to iteratively train the LLM based on its own high-reward outputs. We evaluate REST-PG on the LongLaMP benchmark, consisting of four diverse personalized long-form text generation tasks. Our experiments demonstrate that REST-PG achieves significant improvements over state-of-the-art baselines, with an average relative performance gain of 14.5% on the benchmark.
Submitted 7 January, 2025; originally announced January 2025.
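The reinforced self-training loop can be pictured as an EM-style procedure: sample several candidate responses (with reasoning) per prompt, keep the high-reward ones, fine-tune on them, and repeat. All callables and the reward threshold below are placeholders; REST-PG's reward design and training specifics differ.

```python
from typing import Callable, Iterable

def reinforced_self_training(
    generate: Callable[[str, int], list],   # prompt, k -> k sampled responses (with reasoning)
    reward: Callable[[str, str], float],    # (prompt, response) -> scalar reward
    finetune: Callable[[list], None],       # fine-tune on (prompt, response) pairs
    prompts: Iterable[str],
    rounds: int = 3,
    k: int = 4,
    keep_threshold: float = 0.5,
) -> None:
    """Sample k candidates per prompt (E-step), keep the high-reward ones,
    fine-tune on them (M-step), and repeat for several rounds."""
    for _ in range(rounds):
        batch = []
        for p in prompts:
            candidates = generate(p, k)
            scored = [(reward(p, c), c) for c in candidates]
            batch += [(p, c) for r, c in scored if r >= keep_threshold]
        if batch:
            finetune(batch)

# Toy usage with stub callables:
gen = lambda p, k: [f"{p} -> draft {i}" for i in range(k)]
rew = lambda p, c: 0.9 if c.endswith("0") else 0.2
reinforced_self_training(gen, rew, lambda b: print(f"finetune on {len(b)} pairs"), ["write a bio"])
```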
arXiv:2501.03916 [pdf, other] cs.AI cs.CL cs.CV
Dolphin: Closed-loop Open-ended Auto-research through Thinking, Practice, and Feedback
Authors: Jiakang Yuan, Xiangchao Yan, Botian Shi, Tao Chen, Wanli Ouyang, Bo Zhang, Lei Bai, Yu Qiao, Bowen Zhou
Abstract: The scientific research paradigm is undergoing a profound transformation owing to the development of Artificial Intelligence (AI). Recent works demonstrate that various AI-assisted research methods can largely improve research efficiency by improving data analysis, accelerating computation, and fostering novel idea generation. To further move towards the ultimate goal (i.e., automatic scientific research), in this paper, we propose Dolphin, the first closed-loop open-ended auto-research framework to further build the entire process of human scientific research. Dolphin can generate research ideas, perform experiments, and get feedback from experimental results to generate higher-quality ideas. More specifically, Dolphin first generates novel ideas based on relevant papers, which are ranked by the topic and task attributes. Then, code is automatically generated and debugged with the exception-traceback-guided local code structure. Finally, Dolphin automatically analyzes the results of each idea and feeds the results back to the next round of idea generation. Experiments are conducted on benchmark datasets of different topics, and results show that Dolphin can generate novel ideas continuously and complete the experiment in a loop. We highlight that Dolphin can automatically propose methods that are comparable to the state-of-the-art in some tasks such as 2D image classification and 3D point classification.
Submitted 10 January, 2025; v1 submitted 7 January, 2025; originally announced January 2025.
Comments: 19 pages, 11 figures, and our homepage: https://alpha-innovator.github.io/Dolphin-project-page

arXiv:2501.03058 [pdf, ps, other] cs.LG cs.AI
Survival Analysis Revisited: Understanding and Unifying Poisson, Exponential, and Cox Models in Fall Risk Analysis
Authors: Tianhua Chen
Abstract: This paper explores foundational and applied aspects of survival analysis, using fall risk assessment as a case study. It revisits key time-related probability distributions and statistical methods, including logistic regression, Poisson regression, Exponential regression, and the Cox Proportional Hazards model, offering a unified perspective on their relationships within the survival analysis framework. A contribution of this work is the step-by-step derivation and clarification of the relationships among these models, particularly demonstrating that Poisson regression in the survival context is a specific case of the Cox model. These insights address gaps in understanding and reinforce the simplicity and interpretability of survival models. The paper also emphasizes the practical utility of survival analysis by connecting theoretical insights with real-world applications. In the context of fall detection, it demonstrates how these models can simultaneously predict fall risk, analyze contributing factors, and estimate time-to-event outcomes within a single streamlined framework. In contrast, advanced deep learning methods often require complex post-hoc interpretation and separate training for different tasks, particularly when working with structured numerical data. This highlights the enduring relevance of classical statistical frameworks and makes survival models especially valuable in healthcare settings, where explainability and robustness are critical. By unifying foundational concepts and offering a cohesive perspective on time-to-event analysis, this work serves as an accessible resource for understanding survival models and applying them effectively to diverse analytical challenges.
Submitted 6 January, 2025; originally announced January 2025.
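The Poisson-Cox connection mentioned in the abstract can be made concrete in the constant-hazard (exponential) case; the notation below is generic and not taken from the paper. With event indicator $d_i$, follow-up time $t_i$, and hazard $h_i(t) = h_0 e^{x_i^\top \beta}$, the survival log-likelihood is

$$ \ell_{\text{surv}}(\beta, h_0) = \sum_i \Big[ d_i \big( \log h_0 + x_i^\top \beta \big) - t_i\, h_0\, e^{x_i^\top \beta} \Big]. $$

A Poisson regression of $d_i$ with mean $\mu_i = t_i\, h_0\, e^{x_i^\top \beta}$, i.e. a log link with offset $\log t_i$, has log-likelihood

$$ \ell_{\text{Pois}}(\beta, h_0) = \sum_i \big[ d_i \log \mu_i - \mu_i - \log d_i! \big] = \ell_{\text{surv}}(\beta, h_0) + \sum_i \big( d_i \log t_i - \log d_i! \big), $$

so the two likelihoods differ only by terms free of $(\beta, h_0)$ and yield the same estimates; allowing the baseline hazard to be piecewise constant over follow-up intervals extends the same argument toward the Cox model.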
arXiv:2501.02901 [pdf, other] cs.SE cs.PL
DeCon: Detecting Incorrect Assertions via Postconditions Generated by a Large Language Model
Authors: Hao Yu, Tianyu Chen, Jiaming Huang, Zongyang Li, Dezhi Ran, Xinyu Wang, Ying Li, Assaf Marron, David Harel, Yuan Xie, Tao Xie
Abstract: Recently, given the docstring for the target problem and the target function signature, large language models (LLMs) have been used not only to generate source code, but also to generate test cases, consisting of test inputs and assertions (e.g., in the form of checking an actual output against the expected output). However, as shown by our empirical study on assertions generated by four LLMs for the HumanEval benchmark, over 62% of the generated assertions are incorrect (i.e., failed on the ground-truth problem solution). To detect incorrect assertions (given the docstring and the target function signature along with a sample of example inputs and outputs), in this paper, we propose a new approach named DeCon to effectively detect incorrect assertions via LLM-generated postconditions for the target problem (a postcondition is a predicate that must always be true just after the execution of the ground-truth problem solution). Our approach requires a small set of I/O examples (i.e., a sample of example inputs and outputs) for the target problem (e.g., the I/O examples included in the docstring for a target problem in HumanEval). We use the given I/O examples to filter out those LLM-generated postconditions that are violated by at least one given I/O example. We then use the remaining postconditions to detect incorrect assertions as those assertions that violate at least one remaining postcondition. Experimental results show that DeCon can detect, on average, more than 64% of the incorrect assertions generated by four state-of-the-art LLMs (63% and 65.5% detected by GPT-3.5 and GPT-4, respectively), and DeCon can also improve the effectiveness of these LLMs in code generation by 4% in terms of Pass@1. In addition, although DeCon might filter out correct assertions, the fault-finding ability of the remaining correct assertions decreases only slightly.
Submitted 6 January, 2025; originally announced January 2025.
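The filtering step described above is easy to sketch: treat each LLM-generated postcondition as a predicate over an (input, output) pair, discard any predicate violated by a given I/O example, and then flag an assertion as suspicious if its expected output violates a surviving postcondition. The toy postconditions and the sorting problem below are invented for illustration.

```python
from typing import Callable, Sequence

Postcondition = Callable[[object, object], bool]   # (input, output) -> holds?

def filter_postconditions(posts: Sequence[Postcondition],
                          io_examples: Sequence[tuple]) -> list:
    """Keep only postconditions consistent with every given I/O example."""
    return [p for p in posts if all(p(i, o) for i, o in io_examples)]

def assertion_is_suspicious(inp, expected_out, posts: Sequence[Postcondition]) -> bool:
    """An assertion `f(inp) == expected_out` is flagged if its expected output
    violates any surviving postcondition."""
    return any(not p(inp, expected_out) for p in posts)

# Toy example for a 'sort the list' problem (the postconditions mimic LLM outputs).
posts = [
    lambda i, o: sorted(o) == list(o),   # output is sorted
    lambda i, o: sorted(i) == list(o),   # output is a permutation of the input
    lambda i, o: len(o) == 0,            # wrong postcondition, filtered out below
]
io_examples = [([3, 1, 2], [1, 2, 3])]
kept = filter_postconditions(posts, io_examples)
print(len(kept))                                      # 2
print(assertion_is_suspicious([5, 4], [5, 4], kept))  # True: unsorted expected output
print(assertion_is_suspicious([5, 4], [4, 5], kept))  # False
```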
