Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 557 results for author: <span class="mathjax">Su, Y</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Su%2C+Y">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Su, Y"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Su%2C+Y&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Su, Y"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Su%2C+Y&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Su%2C+Y&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Su%2C+Y&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Su%2C+Y&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Su%2C+Y&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Su%2C+Y&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.18290">arXiv:2411.18290</a> <span> [<a href="https://arxiv.org/pdf/2411.18290">pdf</a>, <a href="https://arxiv.org/format/2411.18290">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Semantic Asymmetry for Precise Gross Tumor Volume Segmentation of Nasopharyngeal Carcinoma in Planning CT </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zi Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zeli Chen</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yanzhou Su</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+T">Tai Ma</a>, <a href="/search/cs?searchtype=author&query=Mok%2C+T+C+W">Tony C. W. 
Mok</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yan-Jie Zhou</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yunhai Bai</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zhinlin Zheng</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+L">Le Lu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yirui Wang</a>, <a href="/search/cs?searchtype=author&query=Ge%2C+J">Jia Ge</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+X">Xianghua Ye</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+S">Senxiang Yan</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+D">Dakai Jin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.18290v1-abstract-short" style="display: inline;"> In the radiation therapy of nasopharyngeal carcinoma (NPC), clinicians typically delineate the gross tumor volume (GTV) using non-contrast planning computed tomography to ensure accurate radiation dose delivery. However, the low contrast between tumors and adjacent normal tissues necessitates that radiation oncologists manually delineate the tumors, often relying on diagnostic MRI for guidance. %… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18290v1-abstract-full').style.display = 'inline'; document.getElementById('2411.18290v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.18290v1-abstract-full" style="display: none;"> In the radiation therapy of nasopharyngeal carcinoma (NPC), clinicians typically delineate the gross tumor volume (GTV) using non-contrast planning computed tomography to ensure accurate radiation dose delivery. However, the low contrast between tumors and adjacent normal tissues necessitates that radiation oncologists manually delineate the tumors, often relying on diagnostic MRI for guidance. % In this study, we propose a novel approach to directly segment NPC gross tumors on non-contrast planning CT images, circumventing potential registration errors when aligning MRI or MRI-derived tumor masks to planning CT. To address the low contrast issues between tumors and adjacent normal structures in planning CT, we introduce a 3D Semantic Asymmetry Tumor segmentation (SATs) method. Specifically, we posit that a healthy nasopharyngeal region is characteristically bilaterally symmetric, whereas the emergence of nasopharyngeal carcinoma disrupts this symmetry. Then, we propose a Siamese contrastive learning segmentation framework that minimizes the voxel-wise distance between original and flipped areas without tumor and encourages a larger distance between original and flipped areas with tumor. Thus, our approach enhances the sensitivity of features to semantic asymmetries. % Extensive experiments demonstrate that the proposed SATs achieves the leading NPC GTV segmentation performance in both internal and external testing, \emph{e.g.}, with at least 2\% absolute Dice score improvement and 12\% average distance error reduction when compared to other state-of-the-art methods in the external testing. 
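   The Siamese symmetry objective described in this abstract is concrete enough to sketch. The following minimal PyTorch-style loss is only an illustration under assumed names, shapes, and margin; it is not the authors' implementation:

```python
# Illustrative sketch of a symmetry-aware contrastive loss in the spirit of SATs:
# pull features of left-right-symmetric (healthy) voxels together, push them apart
# where tumor breaks the symmetry. All names and details are assumptions.
import torch
import torch.nn.functional as F

def semantic_asymmetry_loss(feats: torch.Tensor,
                            tumor_mask: torch.Tensor,
                            margin: float = 1.0) -> torch.Tensor:
    """feats: (B, C, D, H, W) voxel features; tumor_mask: (B, 1, D, H, W) in {0, 1}."""
    flipped_feats = torch.flip(feats, dims=[-1])       # mirror along the left-right axis
    flipped_mask = torch.flip(tumor_mask, dims=[-1])

    # Voxel-wise feature distance between the volume and its mirrored counterpart.
    dist = torch.norm(feats - flipped_feats, dim=1, keepdim=True)   # (B, 1, D, H, W)

    # A voxel counts as asymmetric if tumor appears there in the original or the flip.
    asym = torch.clamp(tumor_mask + flipped_mask, max=1.0)
    sym = 1.0 - asym

    # Minimize the distance over symmetric regions, enforce a margin over tumor regions.
    pull = (dist * sym).sum() / sym.sum().clamp(min=1.0)
    push = (F.relu(margin - dist) * asym).sum() / asym.sum().clamp(min=1.0)
    return pull + push
```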
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18290v1-abstract-full').style.display = 'none'; document.getElementById('2411.18290v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.16537">arXiv:2411.16537</a> <span> [<a href="https://arxiv.org/pdf/2411.16537">pdf</a>, <a href="https://arxiv.org/format/2411.16537">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> RoboSpatial: Teaching Spatial Understanding to 2D and 3D Vision-Language Models for Robotics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+C+H">Chan Hee Song</a>, <a href="/search/cs?searchtype=author&query=Blukis%2C+V">Valts Blukis</a>, <a href="/search/cs?searchtype=author&query=Tremblay%2C+J">Jonathan Tremblay</a>, <a href="/search/cs?searchtype=author&query=Tyree%2C+S">Stephen Tyree</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yu Su</a>, <a href="/search/cs?searchtype=author&query=Birchfield%2C+S">Stan Birchfield</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.16537v1-abstract-short" style="display: inline;"> Spatial understanding is a crucial capability for robots to make grounded decisions based on their environment. This foundational skill enables robots not only to perceive their surroundings but also to reason about and interact meaningfully within the world. In modern robotics, these capabilities are taken on by visual language models, and they face significant challenges when applied to spatial… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.16537v1-abstract-full').style.display = 'inline'; document.getElementById('2411.16537v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.16537v1-abstract-full" style="display: none;"> Spatial understanding is a crucial capability for robots to make grounded decisions based on their environment. This foundational skill enables robots not only to perceive their surroundings but also to reason about and interact meaningfully within the world. In modern robotics, these capabilities are taken on by visual language models, and they face significant challenges when applied to spatial reasoning context due to their training data sources. These sources utilize general-purpose image datasets, and they often lack sophisticated spatial scene understanding capabilities. 
   For example, these datasets do not address reference frame comprehension: interpreting a spatial relationship requires a clear contextual frame, whether ego-centric, object-centric, or world-centric, to support effective real-world interaction. To address this issue, we introduce RoboSpatial, a large-scale spatial understanding dataset consisting of real indoor and tabletop scenes captured as 3D scans and egocentric images, annotated with rich spatial information relevant to robotics. The dataset includes 1M images, 5K 3D scans, and 3M annotated spatial relationships, with paired 2D egocentric images and 3D scans to make it both 2D and 3D ready. Our experiments show that models trained with RoboSpatial outperform baselines on downstream tasks such as spatial affordance prediction, spatial relationship prediction, and robotics manipulation.
   Submitted 25 November, 2024; originally announced November 2024.

3. arXiv:2411.16253 [pdf, other]  cs.CV (Computer Vision and Pattern Recognition)
   Open-Vocabulary Octree-Graph for 3D Scene Understanding
   Authors: Zhigang Wang, Yifei Su, Chenhui Li, Dong Wang, Yan Huang, Bin Zhao, Xuelong Li
   Abstract: Open-vocabulary 3D scene understanding is indispensable for embodied agents. Recent works leverage pretrained vision-language models (VLMs) for object segmentation and project them to point clouds to build 3D maps. Despite progress, a point cloud is a set of unordered coordinates that requires substantial storage space and does not directly convey occupancy information or spatial relations, making existing methods inefficient for downstream tasks such as path planning and complex text-based object retrieval. To address these issues, we propose Octree-Graph, a novel scene representation for open-vocabulary 3D scene understanding. Specifically, a Chronological Group-wise Segment Merging (CGSM) strategy and an Instance Feature Aggregation (IFA) algorithm are first designed to obtain 3D instances and their corresponding semantic features. Subsequently, an adaptive-octree structure is developed that stores semantics and depicts the occupancy of an object adjustably according to its shape. Finally, the Octree-Graph is constructed, where each adaptive octree acts as a graph node and edges describe the spatial relations among nodes. Extensive experiments on various tasks across several widely used datasets demonstrate the versatility and effectiveness of our method.
   Submitted 25 November, 2024; originally announced November 2024.
   Comments: 11 pages, 7 figures
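   The abstract's representation (per-object adaptive octrees as nodes of a relation graph) can be pictured with a small data-structure sketch. The class and field names below are illustrative assumptions, not taken from the paper:

```python
# Hypothetical sketch of an "Octree-Graph"-style structure: each object keeps an
# adaptive occupancy octree plus a semantic feature, and objects are graph nodes
# connected by labeled spatial relations.
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

@dataclass
class OctreeNode:
    center: Tuple[float, float, float]
    size: float
    occupied: bool = False
    children: Optional[List["OctreeNode"]] = None   # None = leaf; otherwise 8 octants

    def subdivide(self) -> None:
        """Split a leaf into 8 octants so occupancy can follow the object's shape."""
        half, quarter = self.size / 2.0, self.size / 4.0
        cx, cy, cz = self.center
        self.children = [
            OctreeNode((cx + dx * quarter, cy + dy * quarter, cz + dz * quarter), half)
            for dx in (-1, 1) for dy in (-1, 1) for dz in (-1, 1)
        ]

@dataclass
class ObjectNode:
    instance_id: int
    semantic_feature: List[float]      # e.g., an aggregated open-vocabulary embedding
    octree: OctreeNode                 # adaptive occupancy of this object

@dataclass
class OctreeGraph:
    nodes: Dict[int, ObjectNode] = field(default_factory=dict)
    edges: Dict[Tuple[int, int], str] = field(default_factory=dict)  # spatial relation label

    def add_relation(self, a: int, b: int, relation: str) -> None:
        self.edges[(a, b)] = relation  # e.g., "on", "left_of", "inside"
```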
4. arXiv:2411.15526 [pdf, other]  eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
   Multi-scale Cascaded Large-Model for Whole-body ROI Segmentation
   Authors: Rui Hao, Dayu Tan, Yansen Su, Chunhou Zheng
   Abstract: Organs-at-risk segmentation is critical for ensuring the safety and precision of radiotherapy and surgical procedures. However, existing methods for organs-at-risk image segmentation often suffer from uncertainties and biases in target selection, as well as insufficient model validation experiments, limiting their generality and reliability in practical applications.
   To address these issues, we propose an innovative cascaded network architecture called the Multi-scale Cascaded Fusing Network (MCFNet), which effectively captures complex multi-scale and multi-resolution features. MCFNet includes a Sharp Extraction Backbone and a Flexible Connection Backbone, which respectively enhance feature extraction in the downsampling and skip-connection stages. This design not only improves segmentation accuracy but also ensures computational efficiency, enabling precise detail capture even in low-resolution images. We conduct experiments on an A6000 GPU with diverse datasets from 671 patients, comprising 36,131 image-mask pairs across 10 different datasets. MCFNet demonstrates strong robustness, performing consistently well across all 10 datasets. Additionally, MCFNet exhibits excellent generalizability, maintaining high accuracy in different clinical scenarios. We also introduce an adaptive loss aggregation strategy to further optimize the model training process, improving both segmentation accuracy and efficiency. Through extensive validation, MCFNet demonstrates superior performance compared to existing methods, providing more reliable image-guided support. Our solution aims to significantly improve the precision and safety of radiotherapy and surgical procedures, advancing personalized treatment. The code is available on GitHub: https://github.com/Henry991115/MCFNet.
   Submitted 23 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15183">arXiv:2411.15183</a> <span> [<a href="https://arxiv.org/pdf/2411.15183">pdf</a>, <a href="https://arxiv.org/format/2411.15183">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Chemical Physics">physics.chem-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Balancing property optimization and constraint satisfaction for constrained multi-property molecular optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xia%2C+X">Xin Xia</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yajie Zhang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+X">Xiangxiang Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xingyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+C">Chunhou Zheng</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yansen Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15183v1-abstract-short" style="display: inline;"> Molecular optimization, which aims to discover improved molecules from a vast chemical search space, is a critical step in chemical development. Various artificial intelligence technologies have demonstrated high effectiveness and efficiency on molecular optimization tasks. However, few of these technologies focus on balancing property optimization with constraint satisfaction, making it difficult… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15183v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15183v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15183v1-abstract-full" style="display: none;"> Molecular optimization, which aims to discover improved molecules from a vast chemical search space, is a critical step in chemical development. Various artificial intelligence technologies have demonstrated high effectiveness and efficiency on molecular optimization tasks. However, few of these technologies focus on balancing property optimization with constraint satisfaction, making it difficult to obtain high-quality molecules that not only possess desirable properties but also meet various constraints. To address this issue, we propose a constrained multi-property molecular optimization framework (CMOMO), which is a flexible and efficient method to simultaneously optimize multiple molecular properties while satisfying several drug-like constraints. CMOMO improves multiple properties of molecules with constraints based on dynamic cooperative optimization, which dynamically handles the constraints across various scenarios. Besides, CMOMO evaluates multiple properties within discrete chemical spaces cooperatively with the evolution of molecules within an implicit molecular space to guide the evolutionary search. 
6. arXiv:2411.14735 [pdf, other]  cs.PL (Programming Languages)
   Automatic Inference of Relational Object Invariants
   Authors: Yusen Su, Jorge A. Navas, Arie Gurfinkel, Isabel Garcia-Contreras
   Abstract: Relational object invariants (or representation invariants) are relational properties that hold over the fields of a (memory) object throughout its lifetime; for example, the length of a buffer never exceeds its capacity. Automatic inference of these invariants is particularly challenging because they are often broken temporarily during field updates. In this paper, we present an Abstract Interpretation-based solution to infer object invariants. Our key insight is a new object abstraction for memory objects, where memory is divided into multiple memory banks, each containing several objects. Within each bank, the objects are further abstracted by separating the most recently used (MRU) object, which is represented precisely with strong updates, while the rest are summarized. For an effective implementation of this approach, we introduce a new composite abstract domain, which forms a reduced product of numerical and equality sub-domains. This design efficiently expresses relationships between a small number of variables (e.g., fields of the same abstract object). We implement the new domain in the CRAB abstract interpreter and evaluate it on several benchmarks for memory safety. We show that our approach is significantly more scalable for relational properties than the existing implementation of CRAB. To evaluate precision, we integrated our analysis as a pre-processing step to the SEABMC bounded model checker, and show that it is effective both at discharging assertions during pre-processing and at significantly improving the run-time of SEABMC.
   Submitted 22 November, 2024; originally announced November 2024.
   Comments: This is an extended version of the VMCAI 2025 paper, consisting of 26 pages. The artifact is available at https://doi.org/10.5281/zenodo.13849174
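   The MRU idea in this abstract can be pictured with a toy model: objects in a bank share a weak summary, while the most recently used object is tracked precisely and admits strong updates. The interval domain and all names below are simplifications for illustration, not the paper's CRAB domain:

```python
# Toy memory-bank abstraction with a precisely tracked MRU object.
from dataclasses import dataclass, field
from typing import Dict, Optional, Tuple

Interval = Tuple[float, float]   # (low, high)

def join(a: Interval, b: Interval) -> Interval:
    return (min(a[0], b[0]), max(a[1], b[1]))

@dataclass
class MemoryBank:
    summary: Dict[str, Interval] = field(default_factory=dict)  # weak summary of non-MRU objects
    mru_id: Optional[int] = None
    mru: Dict[str, Interval] = field(default_factory=dict)      # precise fields of the MRU object

    def use(self, obj_id: int) -> None:
        """Make obj_id the MRU object; fold the previous MRU back into the summary."""
        if self.mru_id is not None and self.mru_id != obj_id:
            for f, v in self.mru.items():
                self.summary[f] = join(self.summary[f], v) if f in self.summary else v
        if self.mru_id != obj_id:
            self.mru_id = obj_id
            # Without more information, the new MRU starts from the bank summary.
            self.mru = dict(self.summary)

    def strong_update(self, obj_id: int, fld: str, value: Interval) -> None:
        """A strong (overwriting) update is only sound on the precisely tracked MRU object."""
        self.use(obj_id)
        self.mru[fld] = value

    def weak_update(self, fld: str, value: Interval) -> None:
        """Updates to summarized objects must join with the old value."""
        self.summary[fld] = join(self.summary[fld], value) if fld in self.summary else value
```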
7. arXiv:2411.14525 [pdf, other]  eess.IV (Image and Video Processing); cs.CV (Computer Vision and Pattern Recognition)
   SegBook: A Simple Baseline and Cookbook for Volumetric Medical Image Segmentation
   Authors: Jin Ye, Ying Chen, Yanjun Li, Haoyu Wang, Zhongying Deng, Ziyan Huang, Yanzhou Su, Chenglong Ma, Yuanfeng Ji, Junjun He
   Abstract: Computed Tomography (CT) is one of the most popular modalities for medical imaging.
   By far, CT images have contributed to the largest publicly available datasets for volumetric medical segmentation tasks, covering full-body anatomical structures. Large amounts of full-body CT images provide the opportunity to pre-train powerful models, e.g., STU-Net pre-trained in a supervised fashion, to segment numerous anatomical structures. However, it remains unclear under which conditions these pre-trained models can be transferred to various downstream medical segmentation tasks, particularly for segmenting other modalities and diverse targets. To address this problem, a large-scale benchmark for comprehensive evaluation is crucial for finding these conditions. Thus, we collected 87 public datasets varying in modality, target, and sample size to evaluate the transfer ability of full-body CT pre-trained models. We then employed a representative model, STU-Net at multiple model scales, to conduct transfer learning across modalities and targets. Our experimental results show that (1) there may be a bottleneck effect concerning dataset size in fine-tuning, with more improvement on both small- and large-scale datasets than on medium-size ones; (2) models pre-trained on full-body CT demonstrate effective modality transfer, adapting well to other modalities such as MRI; and (3) pre-training on full-body CT not only supports strong performance in structure detection but also shows efficacy in lesion detection, showcasing adaptability across target tasks. We hope that this large-scale open evaluation of transfer learning can direct future research in volumetric medical image segmentation.
   Submitted 21 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14522">arXiv:2411.14522</a> <span> [<a href="https://arxiv.org/pdf/2411.14522">pdf</a>, <a href="https://arxiv.org/format/2411.14522">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GMAI-VL & GMAI-VL-5.5M: A Large Vision-Language Model and A Comprehensive Multimodal Dataset Towards General Medical AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianbin Li</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yanzhou Su</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wei Li</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+B">Bin Fu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhe Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Ziyan Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoan Wang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chenglong Ma</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Ying Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+M">Ming Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanjun Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Pengcheng Chen</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+X">Xiaowei Hu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Z">Zhongying Deng</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+Y">Yuanfeng Ji</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jin Ye</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Junjun He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14522v1-abstract-short" style="display: inline;"> Despite significant advancements in general artificial intelligence, such as GPT-4, their effectiveness in the medical domain (general medical AI, GMAI) remains constrained due to the absence of specialized medical knowledge. To address this challenge, we present GMAI-VL-5.5M, a comprehensive multimodal medical dataset created by converting hundreds of specialized medical datasets into meticulousl… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14522v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14522v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14522v1-abstract-full" style="display: none;"> Despite significant advancements in general artificial intelligence, such as GPT-4, their effectiveness in the medical domain (general medical AI, GMAI) remains constrained due to the absence of specialized medical knowledge. To address this challenge, we present GMAI-VL-5.5M, a comprehensive multimodal medical dataset created by converting hundreds of specialized medical datasets into meticulously constructed image-text pairs. This dataset features comprehensive task coverage, diverse modalities, and high-quality image-text data. 
   Building on this multimodal dataset, we propose GMAI-VL, a general medical vision-language model trained with a progressive three-stage strategy. This approach significantly enhances the model by integrating visual and textual information, improving its ability to process multimodal data and support accurate diagnosis and clinical decision-making. Experimental evaluations demonstrate that GMAI-VL achieves state-of-the-art results across a wide range of multimodal medical tasks, such as visual question answering and medical image diagnosis. Our contributions include the development of the GMAI-VL-5.5M dataset, the introduction of the GMAI-VL model, and the establishment of new benchmarks in multiple medical domains. Code and dataset will be released at https://github.com/uni-medical/GMAI-VL.
   Submitted 21 November, 2024; originally announced November 2024.

9. arXiv:2411.13162 [pdf, other]  cs.GT (Computer Science and Game Theory)
   IC Mechanisms for Risk-Averse Advertisers in the Online Advertising System
   Authors: Bingzhe Wang, Ruohan Qian, Yuejia Dou, Qi Qi, Bo Shen, Changyuan Li, Yixuan Zhang, Yixin Su, Xin Yuan, Wenqiang Liu, Bin Zou, Wen Yi, Zhi Guo, Shuanglong Li, Liu Lin
   Abstract: The autobidding system generates huge revenue for advertising platforms, garnering substantial research attention. Existing studies of autobidding systems focus on designing Autobidding Incentive Compatible (AIC) mechanisms, where the mechanism is Incentive Compatible (IC) under ex ante expectations.
   However, upon deploying AIC mechanisms in advertising platforms, we observe a notable deviation between the actual auction outcomes and these expectations at runtime, particularly in scenes with few clicks (sparse-click). This discrepancy undermines truthful bidding among advertisers under AIC mechanisms, especially for risk-averse advertisers who are averse to outcomes that do not align with their expectations. To address this issue, we propose a mechanism, the Decoupled First-Price Auction (DFP), that retains its IC property even during runtime. DFP dynamically adjusts the payment based on real-time user conversion outcomes, ensuring that advertisers' realized utilities closely approximate their expected utilities during runtime. To realize the payment mechanism of DFP, we propose a PPO-based RL algorithm with a meticulously crafted reward function; the algorithm dynamically adjusts the payment to fit the DFP mechanism. We conduct extensive experiments leveraging real-world data to validate our findings.
   Submitted 20 November, 2024; originally announced November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12814">arXiv:2411.12814</a> <span> [<a href="https://arxiv.org/pdf/2411.12814">pdf</a>, <a href="https://arxiv.org/format/2411.12814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Interactive Medical Image Segmentation: A Benchmark Dataset and Baseline </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Junlong Cheng</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+B">Bin Fu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jin Ye</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoan Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianbin Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoyu Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruoyu Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+H">He Yao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junren Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jingwen Li</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yanzhou Su</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Min Zhu</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Junjun He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12814v2-abstract-short" style="display: inline;"> Interactive Medical Image Segmentation (IMIS) has long been constrained by the limited availability of large-scale, diverse, and densely annotated datasets, which hinders model generalization and consistent evaluation across different models. In this paper, we introduce the IMed-361M benchmark dataset, a significant advancement in general IMIS research. First, we collect and standardize over 6.4 m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12814v2-abstract-full').style.display = 'inline'; document.getElementById('2411.12814v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12814v2-abstract-full" style="display: none;"> Interactive Medical Image Segmentation (IMIS) has long been constrained by the limited availability of large-scale, diverse, and densely annotated datasets, which hinders model generalization and consistent evaluation across different models. In this paper, we introduce the IMed-361M benchmark dataset, a significant advancement in general IMIS research. First, we collect and standardize over 6.4 million medical images and their corresponding ground truth masks from multiple data sources. Then, leveraging the strong object recognition capabilities of a vision foundational model, we automatically generated dense interactive masks for each image and ensured their quality through rigorous quality control and granularity management. Unlike previous datasets, which are limited by specific modalities or sparse annotations, IMed-361M spans 14 modalities and 204 segmentation targets, totaling 361 million masks-an average of 56 masks per image. 
   Finally, we develop an IMIS baseline network on this dataset that supports high-quality mask generation through interactive inputs, including clicks, bounding boxes, text prompts, and their combinations. We evaluate its performance on medical image segmentation tasks from multiple perspectives, demonstrating superior accuracy and scalability compared to existing interactive segmentation models. To facilitate research on foundation models in medical computer vision, we release IMed-361M and the model at https://github.com/uni-medical/IMIS-Bench.
   Submitted 24 November, 2024; v1 submitted 19 November, 2024; originally announced November 2024.

11. arXiv:2411.12749 [pdf, other]  cs.SE (Software Engineering)
   Predicting Lemmas in Generalization of IC3
   Authors: Yuheng Su, Qiusong Yang, Yiwei Ci
   Abstract: The IC3 algorithm, also known as PDR, has made a significant impact in the field of safety model checking in recent years due to its high efficiency, scalability, and completeness. The most crucial component of IC3 is inductive generalization, which involves dropping variables one by one and is often the most time-consuming step. In this paper, we propose a novel approach that predicts a possible minimal lemma, before any variables are dropped, by utilizing the counterexample to propagation (CTP). When the prediction succeeds, the variable-dropping loop can be skipped entirely. A comprehensive evaluation demonstrates a commendable success rate in lemma prediction and a significant performance improvement achieved by our proposed method.
   Submitted 4 November, 2024; originally announced November 2024.
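   To see where such a prediction step would sit, here is a schematic of IC3-style inductive generalization with a prediction attempt placed before the usual variable-dropping loop. The helper callables stand in for the solver queries and the CTP-based heuristic; they are assumptions, not the paper's implementation:

```python
# Schematic: try a predicted minimal lemma first; only fall back to dropping
# literals one by one when the prediction is not relatively inductive.
from typing import Callable, FrozenSet, List

Cube = FrozenSet[str]   # a conjunction of literals, represented by their names

def generalize(cube: Cube,
               frame: int,
               is_inductive_relative: Callable[[Cube, int], bool],
               predict_minimal_lemma: Callable[[Cube, int], Cube]) -> Cube:
    """Return a (hopefully small) sub-cube that is still inductive relative to `frame`."""
    # 1. Predicted lemma first; on success the dropping loop is skipped entirely.
    predicted = predict_minimal_lemma(cube, frame)
    if predicted and predicted <= cube and is_inductive_relative(predicted, frame):
        return predicted

    # 2. Classic generalization: drop literals one by one, keeping a drop only if
    #    the shrunk cube remains relatively inductive.
    current: List[str] = list(cube)
    for lit in list(current):
        candidate = frozenset(l for l in current if l != lit)
        if candidate and is_inductive_relative(candidate, frame):
            current = list(candidate)
    return frozenset(current)
```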
The comprehensive evaluation demonstrates a commendable success rate in lemma prediction and a significant performance improvement achieved by our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12749v1-abstract-full').style.display = 'none'; document.getElementById('2411.12749v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11507">arXiv:2411.11507</a> <span> [<a href="https://arxiv.org/pdf/2411.11507">pdf</a>, <a href="https://arxiv.org/format/2411.11507">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SignEye: Traffic Sign Interpretation from Vehicle First-Person View </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chuang Yang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Han%2C+T">Tao Han</a>, <a href="/search/cs?searchtype=author&query=SU%2C+Y">Yuejiao SU</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Junyu Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/cs?searchtype=author&query=Chau%2C+L">Lap-Pui Chau</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11507v1-abstract-short" style="display: inline;"> Traffic signs play a key role in assisting autonomous driving systems (ADS) by enabling the assessment of vehicle behavior in compliance with traffic regulations and providing navigation instructions. However, current works are limited to basic sign understanding without considering the egocentric vehicle's spatial position, which fails to support further regulation assessment and direction naviga… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11507v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11507v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11507v1-abstract-full" style="display: none;"> Traffic signs play a key role in assisting autonomous driving systems (ADS) by enabling the assessment of vehicle behavior in compliance with traffic regulations and providing navigation instructions. However, current works are limited to basic sign understanding without considering the egocentric vehicle's spatial position, which fails to support further regulation assessment and direction navigation. Following the above issues, we introduce a new task: traffic sign interpretation from the vehicle's first-person view, referred to as TSI-FPV. Meanwhile, we develop a traffic guidance assistant (TGA) scenario application to re-explore the role of traffic signs in ADS as a complement to popular autonomous technologies (such as obstacle perception). 
Notably, TGA is not a replacement for electronic map navigation; rather, TGA can be an automatic tool for updating it and complementing it in situations such as offline conditions or temporary sign adjustments. Lastly, a spatial and semantic logic-aware stepwise reasoning pipeline (SignEye) is constructed to achieve the TSI-FPV and TGA, and an application-specific dataset (Traffic-CN) is built. Experiments show that TSI-FPV and TGA are achievable via our SignEye trained on Traffic-CN. The results also demonstrate that the TGA can provide complementary information to ADS beyond existing popular autonomous technologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11507v1-abstract-full').style.display = 'none'; document.getElementById('2411.11507v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09658">arXiv:2411.09658</a> <span> [<a href="https://arxiv.org/pdf/2411.09658">pdf</a>, <a href="https://arxiv.org/format/2411.09658">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Motion Before Action: Diffusing Object Motion as Manipulation Condition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yue Su</a>, <a href="/search/cs?searchtype=author&query=Zhan%2C+X">Xinyu Zhan</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+H">Hongjie Fang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong-Lu Li</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Cewu Lu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Lixin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09658v2-abstract-short" style="display: inline;"> Inferring object motion representations from observations enhances the performance of robotic manipulation tasks. This paper introduces a new paradigm for robot imitation learning that generates action sequences by reasoning about object motion from visual observations. We propose MBA (Motion Before Action), a novel module that employs two cascaded diffusion processes for object motion generation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09658v2-abstract-full').style.display = 'inline'; document.getElementById('2411.09658v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09658v2-abstract-full" style="display: none;"> Inferring object motion representations from observations enhances the performance of robotic manipulation tasks. This paper introduces a new paradigm for robot imitation learning that generates action sequences by reasoning about object motion from visual observations. We propose MBA (Motion Before Action), a novel module that employs two cascaded diffusion processes for object motion generation and robot action generation under object motion guidance. 
MBA first predicts the future pose sequence of the object based on observations, then uses this sequence as a condition to guide robot action generation. Designed as a plug-and-play component, MBA can be flexibly integrated into existing robotic manipulation policies with diffusion action heads. Extensive experiments in both simulated and real-world environments demonstrate that our approach substantially improves the performance of existing policies across a wide range of manipulation tasks. Project page: https://selen-suyue.github.io/MBApage/ <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09658v2-abstract-full').style.display = 'none'; document.getElementById('2411.09658v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07042">arXiv:2411.07042</a> <span> [<a href="https://arxiv.org/pdf/2411.07042">pdf</a>, <a href="https://arxiv.org/format/2411.07042">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Minion: A Technology Probe for Resolving Value Conflicts through Expert-Driven and User-Driven Strategies in AI Companion Applications </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fan%2C+X">Xianzhe Fan</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+Q">Qing Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xuhui Zhou</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yuran Su</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhicong Lu</a>, <a href="/search/cs?searchtype=author&query=Sap%2C+M">Maarten Sap</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H">Hong Shen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07042v1-abstract-short" style="display: inline;"> AI companions based on large language models can role-play and converse very naturally. When value conflicts arise between the AI companion and the user, it may offend or upset the user. Yet, little research has examined such conflicts. We first conducted a formative study that analyzed 151 user complaints about conflicts with AI companions, providing design implications for our study. 
Based on th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07042v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07042v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07042v1-abstract-full" style="display: none;"> AI companions based on large language models can role-play and converse very naturally. When value conflicts arise between the AI companion and the user, it may offend or upset the user. Yet, little research has examined such conflicts. We first conducted a formative study that analyzed 151 user complaints about conflicts with AI companions, providing design implications for our study. Based on these, we created Minion, a technology probe to help users resolve human-AI value conflicts. Minion applies a user-empowerment intervention method that provides suggestions by combining expert-driven and user-driven conflict resolution strategies. We conducted a technology probe study, creating 40 value conflict scenarios on Character.AI and Talkie. 22 participants completed 274 tasks and successfully resolved conflicts 94.16% of the time. We summarize user responses, preferences, and needs in resolving value conflicts, and propose design implications to reduce conflicts and empower users to resolve them more effectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07042v1-abstract-full').style.display = 'none'; document.getElementById('2411.07042v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 5 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06581">arXiv:2411.06581</a> <span> [<a href="https://arxiv.org/pdf/2411.06581">pdf</a>, <a href="https://arxiv.org/format/2411.06581">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Federated LLMs Fine-tuned with Adaptive Importance-Aware LoRA </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yang Su</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+N">Na Yan</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Y">Yansha Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06581v1-abstract-short" style="display: inline;"> Federated fine-tuning of pre-trained Large Language Models (LLMs) enables task-specific adaptation across diverse datasets while preserving data privacy. 
However, the large model size and heterogeneity in client resources pose significant computational and communication challenges. To address these issues, in this paper, we propose a novel Heterogeneous Adaptive Federated Low-Rank Adaptation (LoRA… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06581v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06581v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06581v1-abstract-full" style="display: none;"> Federated fine-tuning of pre-trained Large Language Models (LLMs) enables task-specific adaptation across diverse datasets while preserving data privacy. However, the large model size and heterogeneity in client resources pose significant computational and communication challenges. To address these issues, in this paper, we propose a novel Heterogeneous Adaptive Federated Low-Rank Adaptation (LoRA) fine-tuned LLM framework (HAFL). To accommodate client resource heterogeneity, we first introduce an importance-based parameter truncation scheme, which allows clients to have different LoRA ranks, and smoothed sensitivity scores are used as importance indicators. Despite its flexibility, the truncation process may cause performance degradation. To tackle this problem, we develop an importance-based parameter freezing scheme. In this approach, both the cloud server and clients maintain the same LoRA rank, while clients selectively update only the most important decomposed LoRA rank-1 matrices, keeping the rest frozen. To mitigate the information dilution caused by the zero-padding aggregation method, we propose an adaptive aggregation approach that operates at the decomposed rank-1 matrix level. Experiments on the 20 News Group classification task show that our method converges quickly with low communication size, and avoids performance degradation when distributing models to clients compared to truncation-based heterogeneous LoRA rank scheme. Additionally, our adaptive aggregation method achieves faster convergence compared to the zero-padding approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06581v1-abstract-full').style.display = 'none'; document.getElementById('2411.06581v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06559">arXiv:2411.06559</a> <span> [<a href="https://arxiv.org/pdf/2411.06559">pdf</a>, <a href="https://arxiv.org/format/2411.06559">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Is Your LLM Secretly a World Model of the Internet? 
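<p class="is-size-7">The contrast this abstract draws between zero-padding aggregation and aggregation at the decomposed LoRA rank-1 matrix level can be sketched in a few lines. The snippet below is illustrative only and is not the authors' HAFL implementation: the toy dimensions, the per-component averaging rule, and all names such as zero_pad_average and rank1_average are assumptions made for the example.</p>
<pre><code>
# Illustrative sketch (not HAFL itself): contrast zero-padding aggregation of
# heterogeneous-rank LoRA updates with aggregation at the rank-1 matrix level.
import numpy as np

d, k, r_max = 8, 6, 4                      # toy layer dims and server-side LoRA rank
rng = np.random.default_rng(0)

# Each client holds LoRA factors B (d x r_i) and A (r_i x k) with its own rank r_i.
client_ranks = [2, 3, 4]
clients = [(rng.normal(size=(d, r)), rng.normal(size=(r, k))) for r in client_ranks]

def zero_pad_average(clients, r_max):
    """Pad every client's factors to r_max with zero columns/rows, then average."""
    Bs, As = [], []
    for B, A in clients:
        r = B.shape[1]
        Bs.append(np.pad(B, ((0, 0), (0, r_max - r))))
        As.append(np.pad(A, ((0, r_max - r), (0, 0))))
    return np.mean(Bs, axis=0), np.mean(As, axis=0)

def rank1_average(clients, r_max):
    """Average each decomposed rank-1 component only over the clients that carry it."""
    B_out, A_out = np.zeros((d, r_max)), np.zeros((r_max, k))
    for j in range(r_max):
        holders = [(B, A) for B, A in clients if B.shape[1] > j]
        if holders:
            B_out[:, j] = np.mean([B[:, j] for B, A in holders], axis=0)
            A_out[j, :] = np.mean([A[j, :] for B, A in holders], axis=0)
    return B_out, A_out

B_pad, A_pad = zero_pad_average(clients, r_max)
B_r1, A_r1 = rank1_average(clients, r_max)
# Zero-padding averages the highest-rank component together with zeros from the
# clients that lack it, whereas per-component averaging keeps its scale.
print(np.linalg.norm(B_pad[:, 3]), np.linalg.norm(B_r1[:, 3]))
</code></pre>
<p class="is-size-7">In this toy run the padded average of the rank-4 component is one third of its per-component average, because two of the three clients contribute only zeros there; that is the kind of dilution the abstract's adaptive aggregation is meant to avoid.</p>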
Model-Based Planning for Web Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yu Gu</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+B">Boyuan Zheng</a>, <a href="/search/cs?searchtype=author&query=Gou%2C+B">Boyu Gou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kai Zhang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+C">Cheng Chang</a>, <a href="/search/cs?searchtype=author&query=Srivastava%2C+S">Sanjari Srivastava</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yanan Xie</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+P">Peng Qi</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Huan Sun</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yu Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06559v1-abstract-short" style="display: inline;"> Language agents have demonstrated promising capabilities in automating web-based tasks, though their current reactive approaches still underperform largely compared to humans. While incorporating advanced planning algorithms, particularly tree search methods, could enhance these agents' performance, implementing tree search directly on live websites poses significant safety risks and practical con… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06559v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06559v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06559v1-abstract-full" style="display: none;"> Language agents have demonstrated promising capabilities in automating web-based tasks, though their current reactive approaches still underperform largely compared to humans. While incorporating advanced planning algorithms, particularly tree search methods, could enhance these agents' performance, implementing tree search directly on live websites poses significant safety risks and practical constraints due to irreversible actions such as confirming a purchase. In this paper, we introduce a novel paradigm that augments language agents with model-based planning, pioneering the innovative use of large language models (LLMs) as world models in complex web environments. Our method, WebDreamer, builds on the key insight that LLMs inherently encode comprehensive knowledge about website structures and functionalities. Specifically, WebDreamer uses LLMs to simulate outcomes for each candidate action (e.g., "what would happen if I click this button?") using natural language descriptions, and then evaluates these imagined outcomes to determine the optimal action at each step. Empirical results on two representative web agent benchmarks with online interaction -- VisualWebArena and Mind2Web-live -- demonstrate that WebDreamer achieves substantial improvements over reactive baselines. By establishing the viability of LLMs as world models in web environments, this work lays the groundwork for a paradigm shift in automated web interaction. More broadly, our findings open exciting new avenues for future research into 1) optimizing LLMs specifically for world modeling in complex, dynamic environments, and 2) model-based speculative planning for language agents. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06559v1-abstract-full').style.display = 'none'; document.getElementById('2411.06559v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">18 pages, 6 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06248">arXiv:2411.06248</a> <span> [<a href="https://arxiv.org/pdf/2411.06248">pdf</a>, <a href="https://arxiv.org/format/2411.06248">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Robust Detection of LLM-Generated Text: A Comparative Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yongye Su</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yuqing Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06248v1-abstract-short" style="display: inline;"> The ability of large language models to generate complex texts allows them to be widely integrated into many aspects of life, and their output can quickly fill all network resources. As the impact of LLMs grows, it becomes increasingly important to develop powerful detectors for the generated text. This detector is essential to prevent the potential misuse of these technologies and to protect area… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06248v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06248v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06248v1-abstract-full" style="display: none;"> The ability of large language models to generate complex texts allows them to be widely integrated into many aspects of life, and their output can quickly fill all network resources. As the impact of LLMs grows, it becomes increasingly important to develop powerful detectors for the generated text. This detector is essential to prevent the potential misuse of these technologies and to protect areas such as social media from the negative effects of false content generated by LLMs. The main goal of LLM-generated text detection is to determine whether text is generated by an LLM, which is a basic binary classification task. In our work, we mainly use three different classification methods based on open source datasets: traditional machine learning techniques such as logistic regression, k-means clustering, Gaussian Naive Bayes, support vector machines, and transformer-based methods such as BERT, and finally algorithms that use LLMs to detect LLM-generated text. We focus on model generalization, potential adversarial attacks, and accuracy of model evaluation. 
Finally, the possible research direction in the future is proposed, and the current experimental results are summarized. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06248v1-abstract-full').style.display = 'none'; document.getElementById('2411.06248v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05806">arXiv:2411.05806</a> <span> [<a href="https://arxiv.org/pdf/2411.05806">pdf</a>, <a href="https://arxiv.org/format/2411.05806">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SkipSNN: Efficiently Classifying Spike Trains with Event-attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yin%2C+H">Hang Yin</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yao Su</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Liping Liu</a>, <a href="/search/cs?searchtype=author&query=Hartvigsen%2C+T">Thomas Hartvigsen</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+X">Xin Dai</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+X">Xiangnan Kong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05806v1-abstract-short" style="display: inline;"> Spike train classification has recently become an important topic in the machine learning community, where each spike train is a binary event sequence with \emph{temporal-sparsity of signals of interest} and \emph{temporal-noise} properties. A promising model for it should follow the design principle of performing intensive computation only when signals of interest appear. So such tasks use mainly… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05806v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05806v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05806v1-abstract-full" style="display: none;"> Spike train classification has recently become an important topic in the machine learning community, where each spike train is a binary event sequence with \emph{temporal-sparsity of signals of interest} and \emph{temporal-noise} properties. A promising model for it should follow the design principle of performing intensive computation only when signals of interest appear. So such tasks use mainly Spiking Neural Networks (SNNs) due to their consideration of temporal-sparsity of spike trains. 
However, the basic mechanism of SNNs ignores the temporal-noise issue, which makes them computationally expensive and thus leads to high power consumption when analyzing spike trains on resource-constrained platforms. As an event-driven model, an SNN neuron reacts to any input signal, making it difficult to quickly find signals of interest. In this paper, we introduce an event-attention mechanism that enables SNNs to dynamically highlight useful signals of the original spike trains. To this end, we propose SkipSNN, which extends existing SNN models by learning to mask out noise by skipping membrane potential updates and shortening the effective size of the computational graph. This process is analogous to how people choose to open and close their eyes to filter the information they see. We evaluate SkipSNN on various neuromorphic tasks and demonstrate that it achieves significantly better computational efficiency and classification accuracy than other state-of-the-art SNNs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05806v1-abstract-full').style.display = 'none'; document.getElementById('2411.05806v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a research paper at IEEE BigData 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05278">arXiv:2411.05278</a> <span> [<a href="https://arxiv.org/pdf/2411.05278">pdf</a>, <a href="https://arxiv.org/format/2411.05278">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Integrated Location Sensing and Communication for Ultra-Massive MIMO With Hybrid-Field Beam-Squint Effect </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Z">Zhen Gao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xingyu Zhou</a>, <a href="/search/cs?searchtype=author&query=Ning%2C+B">Boyu Ning</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yu Su</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+T">Tong Qin</a>, <a href="/search/cs?searchtype=author&query=Niyato%2C+D">Dusit Niyato</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05278v1-abstract-short" style="display: inline;"> The advent of ultra-massive multiple-input-multiple output systems holds great promise for next-generation communications, yet their channels exhibit hybrid far- and near- field beam-squint (HFBS) effect. In this paper, we not only overcome but also harness the HFBS effect to propose an integrated location sensing and communication (ILSC) framework. 
During the uplink training stage, user terminals… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05278v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05278v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05278v1-abstract-full" style="display: none;"> The advent of ultra-massive multiple-input-multiple output systems holds great promise for next-generation communications, yet their channels exhibit hybrid far- and near- field beam-squint (HFBS) effect. In this paper, we not only overcome but also harness the HFBS effect to propose an integrated location sensing and communication (ILSC) framework. During the uplink training stage, user terminals (UTs) transmit reference signals for simultaneous channel estimation and location sensing. This stage leverages an elaborately designed hybrid-field projection matrix to overcome the HFBS effect and estimate the channel in compressive manner. Subsequently, the scatterers' locations can be sensed from the spherical wavefront based on the channel estimation results. By treating the sensed scatterers as virtual anchors, we employ a weighted least-squares approach to derive UT' s location. Moreover, we propose an iterative refinement mechanism, which utilizes the accurately estimated time difference of arrival of multipath components to enhance location sensing precision. In the following downlink data transmission stage, we leverage the acquired location information to further optimize the hybrid beamformer, which combines the beam broadening and focusing to mitigate the spectral efficiency degradation resulted from the HFBS effect. Extensive simulation experiments demonstrate that the proposed ILSC scheme has superior location sensing and communication performance than conventional methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05278v1-abstract-full').style.display = 'none'; document.getElementById('2411.05278v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
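<p class="is-size-7">The localization step described in this abstract (treating the sensed scatterers as virtual anchors and recovering the UT position by weighted least squares) follows a standard multilateration pattern. The sketch below is a generic, hedged illustration of that pattern, not the paper's formulation: the anchor coordinates, noise levels, weights, and the particular linearization are all assumptions made for the example.</p>
<pre><code>
# Generic weighted least-squares multilateration from "virtual anchors".
# Not the ILSC algorithm itself; positions, weights, and names are illustrative.
import numpy as np

rng = np.random.default_rng(1)
anchors = np.array([[0.0, 0.0], [40.0, 5.0], [10.0, 30.0], [35.0, 35.0]])  # sensed scatterers
ut_true = np.array([22.0, 18.0])                                           # unknown UT position

# Noisy range estimates to each anchor, with assumed per-anchor noise variances.
ranges = np.linalg.norm(anchors - ut_true, axis=1) + rng.normal(0, 0.5, len(anchors))
weights = 1.0 / np.array([0.5, 0.5, 1.0, 1.0]) ** 2

# Linearize by subtracting the first anchor's range equation:
# ||x - a_i||^2 - ||x - a_0||^2 = d_i^2 - d_0^2
#   =>  2 (a_0 - a_i)^T x = d_i^2 - d_0^2 - ||a_i||^2 + ||a_0||^2
A = 2.0 * (anchors[0] - anchors[1:])
b = (ranges[1:] ** 2 - ranges[0] ** 2
     - np.sum(anchors[1:] ** 2, axis=1) + np.sum(anchors[0] ** 2))
W = np.diag(weights[1:])
x_hat, *_ = np.linalg.lstsq(np.sqrt(W) @ A, np.sqrt(W) @ b, rcond=None)
print("estimated UT position:", x_hat, " true:", ut_true)
</code></pre>
<p class="is-size-7">With a few anchors and mild range noise the weighted solve already lands near the true position; per the abstract, the proposed iterative refinement then uses the estimated time differences of arrival of multipath components to sharpen exactly this kind of estimate.</p>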
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by IEEE JSAC</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01403">arXiv:2411.01403</a> <span> [<a href="https://arxiv.org/pdf/2411.01403">pdf</a>, <a href="https://arxiv.org/format/2411.01403">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> TPOT: Topology Preserving Optimal Transport in Retinal Fundus Image Enhancement </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Dong%2C+X">Xuanzhao Dong</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+W">Wenhui Zhu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+G">Guoxin Sun</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yi Su</a>, <a href="/search/cs?searchtype=author&query=Dumitrascu%2C+O+M">Oana M. Dumitrascu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yalin Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01403v1-abstract-short" style="display: inline;"> Retinal fundus photography enhancement is important for diagnosing and monitoring retinal diseases. However, early approaches to retinal image enhancement, such as those based on Generative Adversarial Networks (GANs), often struggle to preserve the complex topological information of blood vessels, resulting in spurious or missing vessel structures. The persistence diagram, which captures topologi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01403v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01403v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01403v1-abstract-full" style="display: none;"> Retinal fundus photography enhancement is important for diagnosing and monitoring retinal diseases. However, early approaches to retinal image enhancement, such as those based on Generative Adversarial Networks (GANs), often struggle to preserve the complex topological information of blood vessels, resulting in spurious or missing vessel structures. The persistence diagram, which captures topological features based on the persistence of topological structures under different filtrations, provides a promising way to represent the structure information. In this work, we propose a topology-preserving training paradigm that regularizes blood vessel structures by minimizing the differences of persistence diagrams. We call the resulting framework Topology Preserving Optimal Transport (TPOT). Experimental results on a large-scale dataset demonstrate the superiority of the proposed method compared to several state-of-the-art supervised and unsupervised techniques, both in terms of image quality and performance in the downstream blood vessel segmentation task. 
The code is available at https://github.com/Retinal-Research/TPOT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01403v1-abstract-full').style.display = 'none'; document.getElementById('2411.01403v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00023">arXiv:2411.00023</a> <span> [<a href="https://arxiv.org/pdf/2411.00023">pdf</a>, <a href="https://arxiv.org/format/2411.00023">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> </div> </div> <p class="title is-5 mathjax"> Device-Directed Speech Detection for Follow-up Conversations Using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ognjen"> Ognjen</a>, <a href="/search/cs?searchtype=author&query=Rudovic"> Rudovic</a>, <a href="/search/cs?searchtype=author&query=Dighe%2C+P">Pranay Dighe</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yi Su</a>, <a href="/search/cs?searchtype=author&query=Garg%2C+V">Vineet Garg</a>, <a href="/search/cs?searchtype=author&query=Dharur%2C+S">Sameer Dharur</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+X">Xiaochuan Niu</a>, <a href="/search/cs?searchtype=author&query=Abdelaziz%2C+A+H">Ahmed H. Abdelaziz</a>, <a href="/search/cs?searchtype=author&query=Adya%2C+S">Saurabh Adya</a>, <a href="/search/cs?searchtype=author&query=Tewfik%2C+A">Ahmed Tewfik</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00023v2-abstract-short" style="display: inline;"> Follow-up conversations with virtual assistants (VAs) enable a user to seamlessly interact with a VA without the need to repeatedly invoke it using a keyword (after the first query). Therefore, accurate Device-directed Speech Detection (DDSD) from the follow-up queries is critical for enabling naturalistic user experience. To this end, we explore the notion of Large Language Models (LLMs) and mode… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00023v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00023v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00023v2-abstract-full" style="display: none;"> Follow-up conversations with virtual assistants (VAs) enable a user to seamlessly interact with a VA without the need to repeatedly invoke it using a keyword (after the first query). Therefore, accurate Device-directed Speech Detection (DDSD) from the follow-up queries is critical for enabling naturalistic user experience. 
To this end, we explore the notion of Large Language Models (LLMs) and model the first query when making inference about the follow-ups (based on the ASR-decoded text), via prompting of a pretrained LLM, or by adapting a binary classifier on top of the LLM. In doing so, we also exploit the ASR uncertainty when designing the LLM prompts. We show on the real-world dataset of follow-up conversations that this approach yields large gains (20-40% reduction in false alarms at 10% fixed false rejects) due to the joint modeling of the previous speech context and ASR uncertainty, compared to when follow-ups are modeled alone. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00023v2-abstract-full').style.display = 'none'; document.getElementById('2411.00023v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20427">arXiv:2410.20427</a> <span> [<a href="https://arxiv.org/pdf/2410.20427">pdf</a>, <a href="https://arxiv.org/format/2410.20427">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> YourSkatingCoach: A Figure Skating Video Benchmark for Fine-Grained Element Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei-Yi Chen</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yi-Ling Lin</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yu-An Su</a>, <a href="/search/cs?searchtype=author&query=Yeh%2C+W">Wei-Hsin Yeh</a>, <a href="/search/cs?searchtype=author&query=Ku%2C+L">Lun-Wei Ku</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20427v2-abstract-short" style="display: inline;"> Combining sports and machine learning involves leveraging ML algorithms and techniques to extract insight from sports-related data such as player statistics, game footage, and other relevant information. However, datasets related to figure skating in the literature focus primarily on element classification and are currently unavailable or exhibit only limited access, which greatly raise the entry… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20427v2-abstract-full').style.display = 'inline'; document.getElementById('2410.20427v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20427v2-abstract-full" style="display: none;"> Combining sports and machine learning involves leveraging ML algorithms and techniques to extract insight from sports-related data such as player statistics, game footage, and other relevant information. 
However, datasets related to figure skating in the literature focus primarily on element classification and are currently unavailable or exhibit only limited access, which greatly raises the entry barrier to developing visual sports technology for it. Moreover, when using such data to help athletes improve their skills, we find they are very coarse-grained: they work for learning what an element is, but they are poorly suited to learning whether the element is good or bad. Here we propose air time detection, a novel motion analysis task, the goal of which is to accurately detect the duration of the air time of a jump. We present YourSkatingCoach, a large, novel figure skating dataset which contains 454 videos of jump elements, the detected skater skeletons in each video, along with the gold labels of the start and ending frames of each jump, together as a video benchmark for figure skating. In addition, although this type of task is often viewed as classification, we cast it as a sequential labeling problem and propose a Transformer-based model to calculate the duration. Experimental results show that the proposed model yields favorable results against a strong baseline. To further verify the generalizability of the fine-grained labels, we apply the same process to other sports as cross-sport tasks, but for the coarse-grained task of action classification. Here we fine-tune the classification to demonstrate that figure skating, as it contains the essential body movements, constitutes a strong foundation for adaptation to other sports. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20427v2-abstract-full').style.display = 'none'; document.getElementById('2410.20427v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20097">arXiv:2410.20097</a> <span> [<a href="https://arxiv.org/pdf/2410.20097">pdf</a>, <a href="https://arxiv.org/format/2410.20097">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Generative Adversarial Patches for Physical Attacks on Cross-Modal Pedestrian Re-Identification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yue Su</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hao Li</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+M">Maoguo Gong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20097v1-abstract-short" style="display: inline;"> Visible-infrared pedestrian Re-identification (VI-ReID) aims to match pedestrian images captured by infrared cameras and visible cameras. However, VI-ReID, like other traditional cross-modal image matching tasks, poses significant challenges due to its human-centered nature. 
This is evidenced by the shortcomings of existing methods, which struggle to extract common features across modalities, whil… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20097v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20097v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20097v1-abstract-full" style="display: none;"> Visible-infrared pedestrian Re-identification (VI-ReID) aims to match pedestrian images captured by infrared cameras and visible cameras. However, VI-ReID, like other traditional cross-modal image matching tasks, poses significant challenges due to its human-centered nature. This is evidenced by the shortcomings of existing methods, which struggle to extract common features across modalities, while losing valuable information when bridging the gap between them in the implicit feature space, potentially compromising security. To address this vulnerability, this paper introduces the first physical adversarial attack against VI-ReID models. Our method, termed Edge-Attack, specifically tests the models' ability to leverage deep-level implicit features by focusing on edge information, the most salient explicit feature differentiating individuals across modalities. Edge-Attack utilizes a novel two-step approach. First, a multi-level edge feature extractor is trained in a self-supervised manner to capture discriminative edge representations for each individual. Second, a generative model based on Vision Transformer Generative Adversarial Networks (ViTGAN) is employed to generate adversarial patches conditioned on the extracted edge features. By applying these patches to pedestrian clothing, we create realistic, physically-realizable adversarial samples. This black-box, self-supervised approach ensures the generalizability of our attack against various VI-ReID models. Extensive experiments on SYSU-MM01 and RegDB datasets, including real-world deployments, demonstrate the effectiveness of Edge- Attack in significantly degrading the performance of state-of-the-art VI-ReID methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20097v1-abstract-full').style.display = 'none'; document.getElementById('2410.20097v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18178">arXiv:2410.18178</a> <span> [<a href="https://arxiv.org/pdf/2410.18178">pdf</a>, <a href="https://arxiv.org/format/2410.18178">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> </div> </div> <p class="title is-5 mathjax"> Quantum linear system algorithm with optimal queries to initial state preparation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Low%2C+G+H">Guang Hao Low</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yuan Su</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18178v1-abstract-short" style="display: inline;"> Quantum algorithms for linear systems produce the solution state $A^{-1}|b\rangle$ by querying two oracles: $O_A$ that block encodes the coefficient matrix and $O_b$ that prepares the initial state. We present a quantum linear system algorithm making $\mathbf{Θ}\left(1/\sqrt{p}\right)$ queries to $O_b$, which is optimal in the success probability, and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18178v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18178v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18178v1-abstract-full" style="display: none;"> Quantum algorithms for linear systems produce the solution state $A^{-1}|b\rangle$ by querying two oracles: $O_A$ that block encodes the coefficient matrix and $O_b$ that prepares the initial state. We present a quantum linear system algorithm making $\mathbf{Θ}\left(1/\sqrt{p}\right)$ queries to $O_b$, which is optimal in the success probability, and $\mathbf{O}\left(κ\log\left(1/p\right)\left(\log\log\left(1/p\right)+\log\left({1}/ε\right)\right)\right)$ queries to $O_A$, nearly optimal in all parameters including the condition number and accuracy. Notably, our complexity scaling of initial state preparation holds even when $p$ is not known $\textit{a priori}$. This contrasts with recent results achieving $\mathbf{O}\left(κ\log\left({1}/ε\right)\right)$ complexity to both oracles, which, while optimal in $O_A$, is highly suboptimal in $O_b$ as $κ$ can be arbitrarily larger than $1/\sqrt{p}$. In various applications such as solving differential equations, preparing ground states of operators with real spectra, and estimating and transforming eigenvalues of non-normal matrices, we can further improve the dependence on $p$ using a block preconditioning scheme to nearly match or outperform best previous results based on other methods, which also furnishes an extremely simple quantum linear system algorithm with an optimal query complexity to $O_A$. 
Underlying our results is a new Variable Time Amplitude Amplification algorithm with Tunable thresholds (Tunable VTAA), which fully characterizes generic nested amplitude amplifications, improves the $\ell_1$-norm input cost scaling of Ambainis to an $\ell_{\frac{2}{3}}$-quasinorm scaling, and admits a deterministic amplification schedule for the quantum linear system problem. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18178v1-abstract-full').style.display = 'none'; document.getElementById('2410.18178v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">86 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.14386">arXiv:2410.14386</a> <span> [<a href="https://arxiv.org/pdf/2410.14386">pdf</a>, <a href="https://arxiv.org/format/2410.14386">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Geophysics">physics.geo-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Earth and Planetary Astrophysics">astro-ph.EP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Instrumentation and Methods for Astrophysics">astro-ph.IM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Investigating the Capabilities of Deep Learning for Processing and Interpreting One-Shot Multi-offset GPR Data: A Numerical Case Study for Lunar and Martian Environments </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Giannakis%2C+I">Iraklis Giannakis</a>, <a href="/search/cs?searchtype=author&query=Warren%2C+C">Craig Warren</a>, <a href="/search/cs?searchtype=author&query=Giannopoulos%2C+A">Antonios Giannopoulos</a>, <a href="/search/cs?searchtype=author&query=Leontidis%2C+G">Georgios Leontidis</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yan Su</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+F">Feng Zhou</a>, <a href="/search/cs?searchtype=author&query=Martin-Torres%2C+J">Javier Martin-Torres</a>, <a href="/search/cs?searchtype=author&query=Diamanti%2C+N">Nectaria Diamanti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.14386v1-abstract-short" style="display: inline;"> Ground-penetrating radar (GPR) is a mature geophysical method that has gained increasing popularity in planetary science over the past decade. GPR has been utilised both for Lunar and Martian missions providing pivotal information regarding the near surface geology of Terrestrial planets. 
Within that context, numerous processing pipelines have been suggested to address the unique challenges presen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14386v1-abstract-full').style.display = 'inline'; document.getElementById('2410.14386v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.14386v1-abstract-full" style="display: none;"> Ground-penetrating radar (GPR) is a mature geophysical method that has gained increasing popularity in planetary science over the past decade. GPR has been utilised both for Lunar and Martian missions providing pivotal information regarding the near surface geology of Terrestrial planets. Within that context, numerous processing pipelines have been suggested to address the unique challenges present in planetary setups. These processing pipelines often require manual tuning resulting to ambiguous outputs open to non-unique interpretations. These pitfalls combined with the large number of planetary GPR data (kilometers in magnitude), highlight the necessity for automatic, objective and advanced processing and interpretation schemes. The current paper investigates the potential of deep learning for interpreting and processing GPR data. The one-shot multi-offset configuration is investigated via a coherent numerical case study, showcasing the potential of deep learning for A) reconstructing the dielectric distribution of the the near surface of Terrestrial planets, and B) filling missing or bad-quality traces. Special care was taken for the numerical data to be both realistic and challenging. Moreover, the generated synthetic data are properly labelled and made publicly available for training future data-driven pipelines and contributing towards developing pre-trained foundation models for GPR. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.14386v1-abstract-full').style.display = 'none'; document.getElementById('2410.14386v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. 
arXiv:2410.12388 (https://arxiv.org/abs/2410.12388) [cs.CL]
Title: Prompt Compression for Large Language Models: A Survey
Authors: Zongqian Li, Yinhong Liu, Yixuan Su, Nigel Collier
Abstract: Leveraging large language models (LLMs) for complex natural language tasks typically requires long-form prompts to convey detailed requirements and information, which results in increased memory usage and inference costs. To mitigate these challenges, multiple efficient methods have been proposed, with prompt compression gaining significant research interest. This survey provides an overview of prompt compression techniques, categorized into hard prompt methods and soft prompt methods. First, the technical approaches of these methods are compared, followed by an exploration of various ways to understand their mechanisms, including the perspectives of attention optimization, Parameter-Efficient Fine-Tuning (PEFT), modality integration, and new synthetic language. We also examine the downstream adaptations of various prompt compression techniques. Finally, the limitations of current prompt compression methods are analyzed, and several future directions are outlined, such as optimizing the compression encoder, combining hard and soft prompt methods, and leveraging insights from multimodality.
Submitted: 17 October, 2024; v1 submitted 16 October, 2024; originally announced October 2024.
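As a concrete picture of the "hard prompt" family the survey covers, the sketch below filters a prompt by dropping the tokens a small causal LM finds most predictable (lowest self-information) while keeping the rest in order. The model choice (gpt2), keep ratio, and scoring rule are assumptions for illustration, not a specific method from the survey.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def compress(prompt: str, keep_ratio: float = 0.5) -> str:
    """Drop the most predictable tokens, keeping roughly keep_ratio of them."""
    ids = tok(prompt, return_tensors="pt").input_ids           # (1, seq)
    with torch.no_grad():
        logits = lm(ids).logits                                # (1, seq, vocab)
    logprobs = torch.log_softmax(logits[:, :-1], dim=-1)
    # Self-information of each token given its prefix (higher = less predictable).
    info = -logprobs.gather(-1, ids[:, 1:, None]).squeeze(-1)[0]
    k = max(1, int(keep_ratio * info.numel()))
    keep = torch.zeros(ids.shape[1], dtype=torch.bool)
    keep[0] = True                               # always keep the first token
    keep[1 + info.topk(k).indices] = True        # keep the most informative tokens
    return tok.decode(ids[0][keep])

print(compress("Please carefully read the following long and very detailed instructions."))
```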
arXiv:2410.11843 (https://arxiv.org/abs/2410.11843) [cs.HC, cs.AI, cs.DB, cs.LG]
Title: From Commands to Prompts: LLM-based Semantic File System for AIOS
Authors: Zeru Shi, Kai Mei, Mingyu Jin, Yongye Su, Chaoji Zuo, Wenyue Hua, Wujiang Xu, Yujie Ren, Zirui Liu, Mengnan Du, Dong Deng, Yongfeng Zhang
Abstract: Large language models (LLMs) have demonstrated significant potential in the development of intelligent applications and systems such as LLM-based agents and agent operating systems (AIOS). However, when these applications and systems interact with the underlying file system, the file system still follows the traditional paradigm: reliant on manual navigation through precise commands. This paradigm poses a bottleneck to the usability of these systems, as users are required to navigate complex folder hierarchies and remember cryptic file names. To address this limitation, we propose an LLM-based semantic file system (LSFS) for prompt-driven file management.
Unlike conventional approaches, LSFS incorporates LLMs to enable users or agents to interact with files through natural language prompts, facilitating semantic file management. At the macro level, we develop a comprehensive API set to achieve semantic file management functionalities, such as semantic file retrieval, file update monitoring and summarization, and semantic file rollback. At the micro level, we store files by constructing semantic indexes for them, and design and implement syscalls for different semantic operations (e.g., CRUD, group by, join) powered by a vector database. Our experiments show that LSFS offers significant improvements over traditional file systems in terms of user convenience, the diversity of supported functions, and the accuracy and efficiency of file operations. Additionally, with the integration of LLMs, our system enables more intelligent file management tasks, such as content summarization and version comparison, further enhancing its capabilities.
Submitted: 23 September, 2024; originally announced October 2024.
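The semantic file retrieval described above can be pictured with a toy index. The snippet below is a stand-in that uses a bag-of-words embedding and cosine similarity in place of a real embedding model and vector database; it is illustrative only and is not the LSFS API.

```python
import numpy as np
from collections import Counter

def embed(text: str, vocab: list[str]) -> np.ndarray:
    """Toy bag-of-words embedding standing in for a learned text encoder."""
    counts = Counter(text.lower().split())
    return np.array([counts[w] for w in vocab], dtype=float)

# A tiny "semantic index": file path -> embedding of its content summary.
files = {
    "notes/quantum_linear_systems.txt": "amplitude amplification for quantum linear systems",
    "notes/prompt_compression.txt": "survey of hard and soft prompt compression methods",
}
vocab = sorted({w for body in files.values() for w in body.lower().split()})
index = {path: embed(body, vocab) for path, body in files.items()}

def semantic_retrieve(prompt: str, top_k: int = 1) -> list[str]:
    """Return the files whose indexed content best matches a natural-language prompt."""
    q = embed(prompt, vocab)
    def cosine(v: np.ndarray) -> float:
        denom = np.linalg.norm(q) * np.linalg.norm(v)
        return float(q @ v / denom) if denom else 0.0
    return sorted(index, key=lambda path: cosine(index[path]), reverse=True)[:top_k]

print(semantic_retrieve("find my notes about prompt compression"))
```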
arXiv:2410.11824 (https://arxiv.org/abs/2410.11824) [cs.CV]
Title: KITTEN: A Knowledge-Intensive Evaluation of Image Generation on Visual Entities
Authors: Hsin-Ping Huang, Xinyi Wang, Yonatan Bitton, Hagai Taitelbaum, Gaurav Singh Tomar, Ming-Wei Chang, Xuhui Jia, Kelvin C. K. Chan, Hexiang Hu, Yu-Chuan Su, Ming-Hsuan Yang
Abstract: Recent advancements in text-to-image generation have significantly enhanced the quality of synthesized images. Despite this progress, evaluations predominantly focus on aesthetic appeal or alignment with text prompts. Consequently, there is limited understanding of whether these models can accurately represent a wide variety of realistic visual entities, a task requiring real-world knowledge. To address this gap, we propose a benchmark focused on evaluating Knowledge-InTensive image generaTion on real-world ENtities (i.e., KITTEN). Using KITTEN, we conduct a systematic study on the fidelity of entities in text-to-image generation models, focusing on their ability to generate a wide range of real-world visual entities, such as landmark buildings, aircraft, plants, and animals. We evaluate the latest text-to-image models and retrieval-augmented customization models using both automatic metrics and carefully-designed human evaluations, with an emphasis on the fidelity of entities in the generated images. Our findings reveal that even the most advanced text-to-image models often fail to generate entities with accurate visual details. Although retrieval-augmented models can enhance the fidelity of entities by incorporating reference images during testing, they often over-rely on these references and struggle to produce novel configurations of the entity as requested in creative text prompts.
Submitted: 15 October, 2024; originally announced October 2024.
Comments: Project page: https://kitten-project.github.io/

arXiv:2410.11101 (https://arxiv.org/abs/2410.11101) [stat.ML, cs.LG, stat.AP]
Title: A Two-Stage Federated Learning Approach for Industrial Prognostics Using Large-Scale High-Dimensional Signals
Authors: Yuqi Su, Xiaolei Fang
Abstract: Industrial prognostics aims to develop data-driven methods that leverage high-dimensional degradation signals from assets to predict their failure times. The success of these models largely depends on the availability of substantial historical data for training. However, in practice, individual organizations often lack sufficient data to independently train reliable prognostic models, and privacy concerns prevent data sharing between organizations for collaborative model training. To overcome these challenges, this article proposes a statistical learning-based federated model that enables multiple organizations to jointly train a prognostic model while keeping their data local and secure. The proposed approach involves two key stages: federated dimension reduction and federated (log)-location-scale regression. In the first stage, we develop a federated randomized singular value decomposition algorithm for multivariate functional principal component analysis, which efficiently reduces the dimensionality of degradation signals while maintaining data privacy. The second stage proposes a federated parameter estimation algorithm for (log)-location-scale regression, allowing organizations to collaboratively estimate failure time distributions without sharing raw data.
The proposed approach addresses the limitations of existing federated prognostic methods by using statistical learning techniques that perform well with smaller datasets and provide comprehensive failure time distributions. The effectiveness and practicality of the proposed model are validated using simulated data and a dataset from the NASA repository.
Submitted: 14 October, 2024; originally announced October 2024.
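The first stage can be pictured with a generic federated randomized SVD pattern: each organization shares only a small random sketch and a low-dimensional projection of its local degradation signals, never the raw signals. The data partitioning (same signal dimensions, different assets per organization), ranks, and aggregation below are assumptions for illustration, not necessarily the authors' exact algorithm.

```python
import numpy as np

rng = np.random.default_rng(0)

# Simulated setting (assumption): K organizations hold degradation signals
# X_k of shape (p, n_k): same p signal dimensions, different assets.
p, n_assets = 500, [40, 60, 30]
X_local = [rng.standard_normal((p, n)) for n in n_assets]
rank, oversample = 5, 10
sketch_dim = rank + oversample

# Step 1: each site sketches its own block with a locally drawn Gaussian test
# matrix and shares only the p x sketch_dim sketch X_k @ Omega_k.
Y = sum(Xk @ rng.standard_normal((Xk.shape[1], sketch_dim)) for Xk in X_local)

# Step 2: the server orthonormalizes the aggregated sketch into a shared basis.
Q, _ = np.linalg.qr(Y)                                  # (p, sketch_dim)

# Step 3: each site projects its data onto Q and shares a small block.
B = np.hstack([Q.T @ Xk for Xk in X_local])             # (sketch_dim, total assets)

# Step 4: the server finishes the factorization on the small matrix.
Ub, s, Vt = np.linalg.svd(B, full_matrices=False)
U = Q @ Ub[:, :rank]              # approximate principal directions, (p, rank)
scores = Vt[:rank].T * s[:rank]   # per-asset low-dimensional scores for stage two
```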
arXiv:2410.08581 (https://arxiv.org/abs/2410.08581) [cs.HC, cs.AI]
Title: Integrating AI for Enhanced Feedback in Translation Revision: A Mixed-Methods Investigation of Student Engagement
Authors: Simin Xu, Yanfang Su, Kanglong Liu
Abstract: Despite the well-established importance of feedback in education, the application of Artificial Intelligence (AI)-generated feedback, particularly from language models like ChatGPT, remains understudied in translation education. This study investigates the engagement of master's students in translation with ChatGPT-generated feedback during their revision process. A mixed-methods approach, combining a translation-and-revision experiment with quantitative and qualitative analyses, was employed to examine the feedback, translations pre- and post-revision, the revision process, and student reflections. The results reveal complex interrelations among cognitive, affective, and behavioural dimensions influencing students' engagement with AI feedback and their subsequent revisions. Specifically, the findings indicate that students invested considerable cognitive effort in the revision process, despite finding the feedback comprehensible. Additionally, they exhibited moderate affective satisfaction with the feedback model. Behaviourally, their actions were largely influenced by cognitive and affective factors, although some inconsistencies were observed. This research provides novel insights into the potential applications of AI-generated feedback in translation teaching and opens avenues for further investigation into the integration of AI tools in language teaching settings.
Submitted: 11 October, 2024; originally announced October 2024.

arXiv:2410.06238 (https://arxiv.org/abs/2410.06238) [cs.LG, cs.AI, cs.CL]
Title: EVOLvE: Evaluating and Optimizing LLMs For Exploration
Authors: Allen Nie, Yi Su, Bo Chang, Jonathan N. Lee, Ed H. Chi, Quoc V. Le, Minmin Chen
Abstract: Despite their success in many domains, large language models (LLMs) remain under-studied in scenarios requiring optimal decision-making under uncertainty.
This is crucial as many real-world applications, ranging from personalized recommendations to healthcare interventions, demand that LLMs not only predict but also actively learn to make optimal decisions through exploration. In this work, we measure LLMs' (in)ability to make optimal decisions in bandits, a stateless reinforcement learning setting relevant to many applications. We develop a comprehensive suite of environments, including both context-free and contextual bandits with varying task difficulties, to benchmark LLMs' performance. Motivated by the existence of optimal exploration algorithms, we propose efficient ways to integrate this algorithmic knowledge into LLMs: by providing explicit algorithm-guided support during inference, and through algorithm distillation via in-context demonstrations and fine-tuning, using synthetic data generated from these algorithms. Impressively, these techniques allow us to achieve superior exploration performance with smaller models, surpassing larger models on various tasks. We conducted an extensive ablation study to shed light on various factors, such as task difficulty and data representation, that influence the efficiency of LLM exploration. Additionally, we conduct a rigorous analysis of the LLM's exploration efficiency using the concept of regret, linking its ability to explore to the model size and underlying algorithm.
Submitted: 8 October, 2024; originally announced October 2024.
Comments: 28 pages
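The bandit setting and the "algorithmic knowledge" being distilled can be illustrated in a few lines: a context-free Bernoulli bandit with UCB1 as the exploration algorithm, whose (action, reward) trajectory could then be rendered into in-context demonstrations or fine-tuning data. The arm means, horizon, and choice of UCB1 are assumptions for illustration.

```python
import numpy as np

rng = np.random.default_rng(0)

# A context-free Bernoulli bandit (assumed arm means) explored with UCB1.
true_means = np.array([0.2, 0.5, 0.8])
n_arms, horizon = len(true_means), 500
counts, reward_sums = np.zeros(n_arms), np.zeros(n_arms)
trajectory = []                          # (action, reward) pairs for distillation

for t in range(1, horizon + 1):
    if t <= n_arms:
        arm = t - 1                      # pull each arm once to initialize
    else:
        ucb = reward_sums / counts + np.sqrt(2.0 * np.log(t) / counts)
        arm = int(np.argmax(ucb))
    reward = float(rng.random() < true_means[arm])
    counts[arm] += 1
    reward_sums[arm] += reward
    trajectory.append((arm, reward))

regret = horizon * true_means.max() - sum(r for _, r in trajectory)
print(f"UCB1 cumulative regret over {horizon} steps: {regret:.1f}")
```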
arXiv:2410.05243 (https://arxiv.org/abs/2410.05243) [cs.AI, cs.CL, cs.CV]
Title: Navigating the Digital World as Humans Do: Universal Visual Grounding for GUI Agents
Authors: Boyu Gou, Ruohan Wang, Boyuan Zheng, Yanan Xie, Cheng Chang, Yiheng Shu, Huan Sun, Yu Su
Abstract: Multimodal large language models (MLLMs) are transforming the capabilities of graphical user interface (GUI) agents, facilitating their transition from controlled simulations to complex, real-world applications across various platforms. However, the effectiveness of these agents hinges on the robustness of their grounding capability. Current GUI agents predominantly utilize text-based representations such as HTML or accessibility trees, which, despite their utility, often introduce noise, incompleteness, and increased computational overhead. In this paper, we advocate a human-like embodiment for GUI agents that perceive the environment entirely visually and directly take pixel-level operations on the GUI. The key is visual grounding models that can accurately map diverse referring expressions of GUI elements to their coordinates on the GUI across different platforms.
We show that a simple recipe, which includes web-based synthetic data and slight adaptation of the LLaVA architecture, is surprisingly effective for training such visual grounding models. We collect the largest dataset for GUI visual grounding so far, containing 10M GUI elements and their referring expressions over 1.3M screenshots, and use it to train UGround, a strong universal visual grounding model for GUI agents. Empirical results on six benchmarks spanning three categories (grounding, offline agent, and online agent) show that 1) UGround substantially outperforms existing visual grounding models for GUI agents, by up to 20% absolute, and 2) agents with UGround outperform state-of-the-art agents, despite the fact that existing agents use additional text-based input while ours only uses visual perception. These results provide strong support for the feasibility and promise of GUI agents that navigate the digital world as humans do.
Submitted: 7 October, 2024; originally announced October 2024.

arXiv:2410.05080 (https://arxiv.org/abs/2410.05080) [cs.CL, cs.AI, cs.LG]
Title: ScienceAgentBench: Toward Rigorous Assessment of Language Agents for Data-Driven Scientific Discovery
Authors: Ziru Chen, Shijie Chen, Yuting Ning, Qianheng Zhang, Boshi Wang, Botao Yu, Yifei Li, Zeyi Liao, Chen Wei, Zitong Lu, Vishal Dey, Mingyi Xue,
Frazier N. Baker, Benjamin Burns, Daniel Adu-Ampratwum, Xuhui Huang, Xia Ning, Song Gao, Yu Su, Huan Sun
Abstract: The advancements of large language models (LLMs) have piqued growing interest in developing LLM-based language agents to automate scientific discovery end-to-end, which has sparked both excitement and skepticism about their true capabilities. In this work, we call for rigorous assessment of agents on individual tasks in a scientific workflow before making bold claims on end-to-end automation. To ensure the scientific authenticity and real-world relevance of our benchmark, we extract 102 tasks from 44 peer-reviewed publications in four disciplines and engage nine subject matter experts to validate them. We unify the target output for every task to a self-contained Python program file and employ an array of evaluation metrics to examine the generated programs, execution results, and costs. Each task goes through multiple rounds of manual validation by annotators and subject matter experts to ensure its annotation quality and scientific plausibility. We also propose two effective strategies to mitigate data contamination concerns. Using our benchmark, we evaluate five open-weight and proprietary LLMs, each with three frameworks: direct prompting, OpenHands CodeAct, and self-debug. Given three attempts for each task, the best-performing agent can only solve 32.4% of the tasks independently and 34.3% with expert-provided knowledge. In addition, we evaluate OpenAI o1 with direct prompting and self-debug, which demonstrates the effectiveness of increasing inference-time compute. Still, our results underscore the limitations of current language agents in generating code for data-driven discovery, let alone end-to-end automation for scientific research.
Submitted: 23 October, 2024; v1 submitted 7 October, 2024; originally announced October 2024.
Comments: 57 pages
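Because every target output is a self-contained Python program, the execution side of such an evaluation can be approximated with a bare-bones harness like the one below. The paths, timeout, and success criterion are hypothetical; the benchmark's actual metrics (program quality, output correctness, and cost) are richer than this.

```python
import subprocess
import sys
from pathlib import Path

def run_candidate(program: Path, expected_artifact: Path, timeout_s: int = 300) -> bool:
    """Run an agent-generated program; count it as solved only if it exits cleanly
    and produces the expected output artifact (a deliberately crude criterion)."""
    try:
        result = subprocess.run(
            [sys.executable, str(program)],
            capture_output=True,
            text=True,
            timeout=timeout_s,
        )
    except subprocess.TimeoutExpired:
        return False
    return result.returncode == 0 and expected_artifact.exists()

# Example usage with placeholder paths:
# solved = run_candidate(Path("pred_programs/task_042.py"), Path("pred_results/task_042.csv"))
```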
arXiv:2410.04682 (https://arxiv.org/abs/2410.04682) [cs.LG, cs.CV]
Title: On the Adversarial Risk of Test Time Adaptation: An Investigation into Realistic Test-Time Data Poisoning
Authors: Yongyi Su, Yushu Li, Nanqing Liu, Kui Jia, Xulei Yang, Chuan-Sheng Foo, Xun Xu
Abstract: Test-time adaptation (TTA) updates the model weights during the inference stage using testing data to enhance generalization. However, this practice exposes TTA to adversarial risks. Existing studies have shown that when TTA is updated with crafted adversarial test samples, also known as test-time poisoned data, the performance on benign samples can deteriorate. Nonetheless, the perceived adversarial risk may be overstated if the poisoned data is generated under overly strong assumptions.
In this work, we first review realistic assumptions for test-time data poisoning, including white-box versus grey-box attacks, access to benign data, attack budget, and more. We then propose an effective and realistic attack method that better produces poisoned samples without access to benign samples, and derive an effective in-distribution attack objective. We also design two TTA-aware attack objectives. Our benchmarks of existing attack methods reveal that the TTA methods are more robust than previously believed. In addition, we analyze effective defense strategies to help develop adversarially robust TTA methods.
Submitted: 15 October, 2024; v1 submitted 6 October, 2024; originally announced October 2024.
Comments: 19 pages, 4 figures, 8 tables

arXiv:2410.04409 (https://arxiv.org/abs/2410.04409) [quant-ph, cs.DS, math.OC]
Title: Quantum Approximate Optimization Algorithms for Maximum Cut on Low-Girth Graphs
Authors: Tongyang Li, Yuexin Su, Ziyi Yang, Shengyu Zhang
Abstract: Maximum cut (MaxCut) on graphs is a classic NP-hard problem.
In quantum computing, Farhi, Gutmann, and Goldstone proposed the Quantum Approximate Optimization Algorithm (QAOA) for solving the MaxCut problem. Its guarantee on cut fraction (the fraction of edges in the output cut over all edges) was mainly studied for high-girth graphs, i.e., graphs with only long cycles. On the other hand, low-girth graphs are ubiquitous in theoretical computer science, with expander graphs being outstanding examples with wide applications in theory and beyond. In this paper, we apply QAOA to MaxCut on a set of expander graphs proposed by Mohanty and O'Donnell known as additive product graphs. Additionally, we apply multi-angle QAOA (ma-QAOA) to better utilize the graph structure of additive product graphs in ansatz design. In theory, we derive an iterative formula to calculate the expected cut fraction of such graphs. On the experimental side, we conduct numerical experiments to compare the best-known classical local algorithms with QAOA at constant depth. Our results demonstrate that QAOA outperforms the best-known classical algorithms by 0.3% to 5.2% on several additive product graphs, while ma-QAOA further enhances this advantage by an additional 0.6% to 2.5%. In particular, we observe cases in which ma-QAOA exhibits superiority over the best-known classical algorithms but QAOA does not. Furthermore, we extend our experiments to planar graphs such as tiling grid graphs, where QAOA also demonstrates an advantage.
Submitted: 6 October, 2024; originally announced October 2024.
Comments: 20 pages, 6 figures
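For reference, the standard depth-$p$ QAOA objects discussed above are, for a graph $G=(V,E)$ on $n$ vertices (these are the textbook definitions, not the paper's additive-product-specific analysis):
$$C = \sum_{(u,v)\in E} \tfrac{1}{2}\left(1 - Z_u Z_v\right), \qquad B = \sum_{v\in V} X_v, \qquad |\boldsymbol{\gamma},\boldsymbol{\beta}\rangle = e^{-i\beta_p B}\, e^{-i\gamma_p C} \cdots e^{-i\beta_1 B}\, e^{-i\gamma_1 C}\, |+\rangle^{\otimes n},$$
with the cut fraction given by $\langle\boldsymbol{\gamma},\boldsymbol{\beta}|\, C\, |\boldsymbol{\gamma},\boldsymbol{\beta}\rangle / |E|$, maximized over the $2p$ angles. ma-QAOA generalizes this ansatz by allowing a separate angle for each edge term of $C$ and each vertex term of $B$ in every layer.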
arXiv:2410.03907 (https://arxiv.org/abs/2410.03907) [cs.CL]
Title: ActPlan-1K: Benchmarking the Procedural Planning Ability of Visual Language Models in Household Activities
Authors: Ying Su, Zhan Ling, Haochen Shi, Jiayang Cheng, Yauwai Yim, Yangqiu Song
Abstract: Large language models (LLMs) have been adopted to process textual task descriptions and accomplish procedural planning in embodied AI tasks because of their powerful reasoning ability. However, there is still a lack of study on how vision language models (VLMs) behave when multi-modal task inputs are considered. Counterfactual planning, which evaluates the model's reasoning ability over alternative task situations, is also underexplored. In order to evaluate the planning ability along both the multi-modal and counterfactual aspects, we propose ActPlan-1K. ActPlan-1K is a multi-modal planning benchmark constructed based on ChatGPT and the household activity simulator iGibson2. The benchmark consists of 153 activities and 1,187 instances. Each instance describing one activity has a natural language task description and multiple environment images from the simulator. The gold plan of each instance is an action sequence over the objects in the provided scenes. Both correctness and commonsense satisfaction are evaluated on typical VLMs. It turns out that current VLMs still struggle to generate human-level procedural plans for both normal activities and counterfactual activities. We further provide automatic evaluation metrics by fine-tuning a BLEURT model to facilitate future research on our benchmark.
Submitted: 4 October, 2024; originally announced October 2024.
Comments: 13 pages, 9 figures, 8 tables; accepted to EMNLP 2024 main conference

arXiv:2410.03227 (https://arxiv.org/abs/2410.03227) [cs.CL]
Title: ALR$^2$: A Retrieve-then-Reason Framework for Long-context Question Answering
Authors: Huayang Li, Pat Verga, Priyanka Sen, Bowen Yang, Vijay Viswanathan, Patrick Lewis, Taro Watanabe, Yixuan Su
Abstract: The context window of large language models (LLMs) has been extended significantly in recent years. However, while the context length that the LLM can process has grown, the capability of the model to accurately reason over that context degrades noticeably. This occurs because modern LLMs often become overwhelmed by the vast amount of information in the context; when answering questions, the model must identify and reason over relevant evidence sparsely distributed throughout the text.
To alleviate the challenge of long-context reasoning, we develop a retrieve-then-reason framework, enabling LLMs to reason over relevant evidence collected during an intermediate retrieval step. We find that modern LLMs struggle to accurately retrieve relevant facts and instead often hallucinate "retrieved facts", resulting in flawed reasoning and the production of incorrect answers. To address these issues, we introduce ALR$^2$, a method that augments the long-context reasoning capability of LLMs via an explicit two-stage procedure, i.e., aligning LLMs with the objectives of both retrieval and reasoning. We demonstrate the efficacy of ALR$^2$ for mitigating performance degradation in long-context reasoning tasks. Through extensive experiments on long-context QA benchmarks, we find our method to outperform competitive baselines by large margins, achieving at least 8.4 and 7.9 EM gains on the long-context versions of the HotpotQA and SQuAD datasets, respectively.
Submitted: 4 October, 2024; originally announced October 2024.

arXiv:2410.02642 (https://arxiv.org/abs/2410.02642) [cs.CL, cs.IR]
Title: Attention in Large Language Models Yields Efficient Zero-Shot Re-Rankers
Authors: Shijie Chen, Bernal Jiménez Gutiérrez, Yu Su
Abstract: Information retrieval (IR) systems have played a vital role in modern digital life and have cemented their continued usefulness in this new era of generative AI via retrieval-augmented generation. With strong language processing capabilities and remarkable versatility, large language models (LLMs) have become popular choices for zero-shot re-ranking in IR systems. So far, LLM-based re-ranking methods rely on strong generative capabilities, which restricts their use to either specialized or powerful proprietary models. Given these restrictions, we ask: is autoregressive generation necessary and optimal for LLMs to perform re-ranking? We hypothesize that there are abundant signals relevant to re-ranking within LLMs that might not be used to their full potential via generation. To more directly leverage such signals, we propose in-context re-ranking (ICR), a novel method that leverages the change in attention pattern caused by the search query for accurate and efficient re-ranking. To mitigate the intrinsic biases in LLMs, we propose a calibration method using a content-free query. Due to the absence of generation, ICR only requires two ($O(1)$) forward passes to re-rank $N$ documents, making it substantially more efficient than generative re-ranking methods that require at least $O(N)$ forward passes. Our novel design also enables ICR to be applied to any LLM without specialized training while guaranteeing a well-formed ranking. Extensive experiments with two popular open-weight LLMs on standard single-hop and multi-hop information retrieval benchmarks show that ICR outperforms RankGPT while cutting the latency by more than 60% in practice. Through detailed analyses, we show that ICR's performance is especially strong on tasks that require more complex re-ranking signals. Our findings call for further exploration of novel ways of utilizing open-weight LLMs beyond text generation.
Submitted: 3 October, 2024; originally announced October 2024.
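The core idea can be sketched roughly as follows: score each document by the attention mass that query tokens place on its tokens in a single forward pass, and calibrate by subtracting the scores obtained with a content-free query. The model (gpt2), prompt layout, and uniform layer/head averaging below are assumptions for illustration, not the paper's exact recipe.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def attention_scores(docs: list[str], query: str) -> list[float]:
    """One forward pass: sum attention from query tokens to each document's tokens."""
    ids, spans = [], []
    for doc in docs:
        piece = tok(doc + "\n", add_special_tokens=False).input_ids
        spans.append((len(ids), len(ids) + len(piece)))
        ids += piece
    query_start = len(ids)
    ids += tok("Query: " + query, add_special_tokens=False).input_ids
    with torch.no_grad():
        out = lm(torch.tensor([ids]), output_attentions=True)
    # Average over layers and heads -> a single (seq, seq) attention map.
    att = torch.stack(out.attentions).mean(dim=(0, 2))[0]
    return [att[query_start:, start:end].sum().item() for start, end in spans]

def rerank(docs: list[str], query: str) -> list[str]:
    raw = attention_scores(docs, query)
    baseline = attention_scores(docs, "N/A")        # content-free calibration query
    scores = [r - b for r, b in zip(raw, baseline)]
    return [doc for _, doc in sorted(zip(scores, docs), reverse=True)]
```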
arXiv:2409.20559 (https://arxiv.org/abs/2409.20559) [cs.LG, cs.CV]
Title: Supervised Multi-Modal Fission Learning
Authors: Lingchao Mao, Qi Wang, Yi Su, Fleming Lure, Jing Li
Abstract: Learning from multimodal datasets can leverage complementary information and improve performance in prediction tasks. A commonly used strategy to account for feature correlations in high-dimensional datasets is the latent variable approach. Several latent variable methods have been proposed for multimodal datasets. However, these methods either focus on extracting the shared component across all modalities or on extracting both a shared component and individual components specific to each modality. To address this gap, we propose a Multi-Modal Fission Learning (MMFL) model that simultaneously identifies globally joint, partially joint, and individual components underlying the features of multimodal datasets. Unlike existing latent variable methods, MMFL uses supervision from the response variable to identify predictive latent components and has a natural extension for incorporating incomplete multimodal data. Through simulation studies, we demonstrate that MMFL outperforms various existing multimodal algorithms in both complete and incomplete modality settings. We applied MMFL to a real-world case study for early prediction of Alzheimer's Disease using multimodal neuroimaging and genomics data from the Alzheimer's Disease Neuroimaging Initiative (ADNI) dataset. MMFL provided more accurate predictions and better insights into within- and across-modality correlations compared to existing methods.
arXiv:2409.19527 (https://arxiv.org/abs/2409.19527) [pdf, other]
Subjects: cs.AI, cs.CV, cs.CY
Title: BuildingView: Constructing Urban Building Exteriors Databases with Street View Imagery and Multimodal Large Language Models
Authors: Zongrong Li, Yunlei Su, Chenyuan Zhu, Wufan Zhao
Abstract: Urban building exteriors are increasingly important in urban analytics, driven by advancements in Street View Imagery and its integration with urban research. Multimodal Large Language Models (LLMs) offer powerful tools for urban annotation, enabling deeper insights into urban environments. However, challenges remain in creating accurate and detailed urban building exterior databases, identifying critical indicators for energy efficiency, environmental sustainability, and human-centric design, and systematically organizing these indicators. To address these challenges, we propose BuildingView, a novel approach that integrates high-resolution visual data from Google Street View with spatial information from OpenStreetMap via the Overpass API. This research improves the accuracy of urban building exterior data, identifies key sustainability and design indicators, and develops a framework for their extraction and categorization. Our methodology includes a systematic literature review, building and Street View sampling, and annotation using the ChatGPT-4O API. The resulting database, validated with data from New York City, Amsterdam, and Singapore, provides a comprehensive tool for urban studies, supporting informed decision-making in urban planning, architectural design, and environmental policy. The code for BuildingView is available at https://github.com/Jasper0122/BuildingView.
Submitted 28 September, 2024; originally announced September 2024.
Comments: 8 pages, 6 figures
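As a hedged sketch of the OpenStreetMap side of such a pipeline (not the BuildingView code), the snippet below fetches building footprints for a bounding box from the public Overpass API; sampling points for Street View imagery could then be derived from these footprints. The bounding box and tag filter are illustrative choices, not the paper's configuration.

# Query OpenStreetMap building footprints through the Overpass API (illustrative).
import requests

OVERPASS_URL = "https://overpass-api.de/api/interpreter"

def fetch_buildings(south, west, north, east):
    """Return OSM 'building' ways (with geometry) inside a bounding box."""
    query = f"""
    [out:json][timeout:60];
    way["building"]({south},{west},{north},{east});
    out geom;
    """
    resp = requests.post(OVERPASS_URL, data={"data": query}, timeout=90)
    resp.raise_for_status()
    return resp.json().get("elements", [])

# Example: a small patch of Lower Manhattan (illustrative coordinates).
buildings = fetch_buildings(40.705, -74.015, 40.712, -74.005)
for b in buildings[:5]:
    lat = sum(p["lat"] for p in b["geometry"]) / len(b["geometry"])
    lon = sum(p["lon"] for p in b["geometry"]) / len(b["geometry"])
    print(b["id"], round(lat, 5), round(lon, 5))   # footprint centroid per building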
arXiv:2409.16223 (https://arxiv.org/abs/2409.16223) [pdf, other]
Subjects: cs.LG, cs.AI, cs.CV
Title: Fine-Tuning is Fine, if Calibrated
Authors: Zheda Mai, Arpita Chowdhury, Ping Zhang, Cheng-Hao Tu, Hong-You Chen, Vardaan Pahuja, Tanya Berger-Wolf, Song Gao, Charles Stewart, Yu Su, Wei-Lun Chao
Abstract: Fine-tuning is arguably the most straightforward way to tailor a pre-trained model (e.g., a foundation model) to downstream applications, but it also comes with the risk of losing valuable knowledge the model had learned in pre-training. For example, fine-tuning a pre-trained classifier capable of recognizing a large number of classes to master a subset of classes at hand is shown to drastically degrade the model's accuracy in the other classes it had previously learned. As such, it is hard to further use the fine-tuned model when it encounters classes beyond the fine-tuning data. In this paper, we systematically dissect the issue, aiming to answer the fundamental question, "What has been damaged in the fine-tuned model?" To our surprise, we find that the fine-tuned model neither forgets the relationship among the other classes nor degrades the features to recognize these classes. Instead, the fine-tuned model often produces more discriminative features for these other classes, even if they were missing during fine-tuning! What really hurts the accuracy is the discrepant logit scales between the fine-tuning classes and the other classes, implying that a simple post-processing calibration would bring back the pre-trained model's capability and at the same time unveil the feature improvement over all classes. We conduct an extensive empirical study to demonstrate the robustness of our findings and provide preliminary explanations underlying them, suggesting new directions for future theoretical analysis. Our code is available at https://github.com/OSU-MLB/Fine-Tuning-Is-Fine-If-Calibrated.
Submitted 13 October, 2024; v1 submitted 24 September, 2024; originally announced September 2024.
Comments: The paper has been accepted to NeurIPS 2024. The first three authors contribute equally
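The abstract points at a simple post-processing fix for the discrepant logit scales. A minimal sketch of that kind of calibration, under the assumption that a single scalar offset is added to the logits of classes absent from fine-tuning and chosen on a held-out set, is shown below; the grid search and variable names are illustrative, not necessarily the paper's exact recipe.

# Post-hoc logit calibration sketch: lift the logits of classes unseen during
# fine-tuning by one scalar so both groups of logits live on a comparable scale.
import numpy as np

def calibrate(logits, absent_classes, offset):
    """Add `offset` to the logits of classes unseen during fine-tuning."""
    adjusted = logits.copy()
    adjusted[:, absent_classes] += offset
    return adjusted

def pick_offset(val_logits, val_labels, absent_classes,
                grid=np.linspace(0.0, 10.0, 101)):
    """Choose the offset that maximizes accuracy on a held-out validation set."""
    best_offset, best_acc = 0.0, -1.0
    for gamma in grid:
        preds = calibrate(val_logits, absent_classes, gamma).argmax(axis=1)
        acc = (preds == val_labels).mean()
        if acc > best_acc:
            best_offset, best_acc = gamma, acc
    return best_offset

# Toy usage: 6 classes, classes 3-5 were missing from the fine-tuning data,
# so the fine-tuning classes end up with inflated logits.
rng = np.random.default_rng(0)
val_logits = rng.normal(size=(200, 6))
val_logits[:, :3] += 4.0
val_labels = rng.integers(0, 6, size=200)
print("chosen offset:", pick_offset(val_logits, val_labels, absent_classes=[3, 4, 5]))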
arXiv:2409.13401 (https://arxiv.org/abs/2409.13401) [pdf, other]
Subjects: cs.CV
Title: PointSAM: Pointly-Supervised Segment Anything Model for Remote Sensing Images
Authors: Nanqing Liu, Xun Xu, Yongyi Su, Haojie Zhang, Heng-Chao Li
Abstract: Segment Anything Model (SAM) is an advanced foundational model for image segmentation, widely applied to remote sensing images (RSIs). Due to the domain gap between RSIs and natural images, traditional methods typically use SAM as a source pre-trained model and fine-tune it with fully supervised masks. Unlike these methods, our work focuses on fine-tuning SAM using more convenient and challenging point annotations. Leveraging SAM's zero-shot capabilities, we adopt a self-training framework that iteratively generates pseudo-labels for training. However, noisy pseudo-labels carry a risk of error accumulation. To address this issue, we extract target prototypes from the target dataset and use the Hungarian algorithm to match them with prediction prototypes, preventing the model from learning in the wrong direction. Additionally, due to the complex backgrounds and dense distribution of objects in RSIs, using point prompts may result in multiple objects being recognized as one. To solve this problem, we propose a negative prompt calibration method based on the non-overlapping nature of instance masks. In brief, we use the prompts of overlapping masks as corresponding negative signals, resulting in refined masks. Combining the above methods, we propose a novel Pointly-supervised Segment Anything Model named PointSAM. We conduct experiments on RSI datasets, including WHU, HRSID, and NWPU VHR-10, and the results show that our method significantly outperforms direct testing with SAM, SAM2, and other comparison methods. Furthermore, we introduce PointSAM as a point-to-box converter and achieve encouraging results, suggesting that this method can be extended to other point-supervised tasks. The code is available at https://github.com/Lans1ng/PointSAM.
Submitted 20 September, 2024; originally announced September 2024.
Comments: 15 pages
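The prototype-matching step mentioned in the abstract can be illustrated with a short sketch (not the PointSAM code): target and prediction prototypes are paired with the Hungarian algorithm under a cosine-distance cost, so pseudo-labels are only trusted where a prediction lines up with a target prototype. The dimensions and data below are synthetic.

# Hungarian matching of prototypes with scipy (illustrative).
import numpy as np
from scipy.optimize import linear_sum_assignment

def match_prototypes(target_protos, pred_protos):
    """Return (target_idx, pred_idx) pairs minimizing total cosine distance."""
    t = target_protos / np.linalg.norm(target_protos, axis=1, keepdims=True)
    p = pred_protos / np.linalg.norm(pred_protos, axis=1, keepdims=True)
    cost = 1.0 - t @ p.T                       # cosine distance matrix (n_t x n_p)
    rows, cols = linear_sum_assignment(cost)   # optimal one-to-one assignment
    return list(zip(rows.tolist(), cols.tolist())), cost[rows, cols]

rng = np.random.default_rng(1)
targets = rng.normal(size=(4, 256))                              # target prototypes
preds = targets[[2, 0, 3, 1]] + 0.05 * rng.normal(size=(4, 256))  # shuffled + noisy
pairs, costs = match_prototypes(targets, preds)
print(pairs)   # recovers the permutation: [(0, 1), (1, 3), (2, 0), (3, 2)]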
arXiv:2409.12917 (https://arxiv.org/abs/2409.12917) [pdf, other]
Subjects: cs.LG
Title: Training Language Models to Self-Correct via Reinforcement Learning
Authors: Aviral Kumar, Vincent Zhuang, Rishabh Agarwal, Yi Su, John D Co-Reyes, Avi Singh, Kate Baumli, Shariq Iqbal, Colton Bishop, Rebecca Roelofs, Lei M Zhang, Kay McKinney, Disha Shrivastava, Cosmin Paduraru, George Tucker, Doina Precup, Feryal Behbahani, Aleksandra Faust
Abstract: Self-correction is a highly desirable capability of large language models (LLMs), yet it has consistently been found to be largely ineffective in modern LLMs. Current methods for training self-correction typically depend on either multiple models, a more advanced model, or additional forms of supervision. To address these shortcomings, we develop a multi-turn online reinforcement learning (RL) approach, SCoRe, that significantly improves an LLM's self-correction ability using entirely self-generated data. To build SCoRe, we first show that variants of supervised fine-tuning (SFT) on offline model-generated correction traces are often insufficient for instilling self-correction behavior. In particular, we observe that training via SFT falls prey to either a distribution mismatch between mistakes made by the data-collection policy and the model's own responses, or to behavior collapse, where learning implicitly prefers only a certain mode of correction behavior that is often not effective at self-correction on test problems. SCoRe addresses these challenges by training under the model's own distribution of self-generated correction traces and using appropriate regularization to steer the learning process into learning a self-correction behavior that is effective at test time, as opposed to fitting high-reward responses for a given prompt. This regularization process includes an initial phase of multi-turn RL on a base model to generate a policy initialization that is less susceptible to collapse, followed by using a reward bonus to amplify self-correction. With Gemini 1.0 Pro and 1.5 Flash models, we find that SCoRe achieves state-of-the-art self-correction performance, improving the base models' self-correction by 15.6% and 9.1% respectively on MATH and HumanEval.
Submitted 4 October, 2024; v1 submitted 19 September, 2024; originally announced September 2024.
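The "reward bonus to amplify self-correction" can be made concrete with an illustrative reward shaping, not the SCoRe implementation: score a two-turn episode by the correctness of the revised answer plus a bonus proportional to the improvement over the first attempt, which discourages the policy from collapsing to making no edit. The correctness function and bonus weight below are stand-ins.

# Illustrative two-turn reward shaping for self-correction training.
def self_correction_reward(first_answer, second_answer, reference,
                           is_correct, bonus_weight=0.5):
    """Reward = correctness of attempt 2 + bonus for improving on attempt 1."""
    r1 = float(is_correct(first_answer, reference))
    r2 = float(is_correct(second_answer, reference))
    return r2 + bonus_weight * (r2 - r1)

# Toy usage with exact-match correctness.
exact = lambda ans, ref: ans.strip() == ref.strip()
print(self_correction_reward("41", "42", "42", exact))   #  1.5: fixed a mistake
print(self_correction_reward("42", "41", "42", exact))   # -0.5: broke a correct answer
print(self_correction_reward("42", "42", "42", exact))   #  1.0: kept it correct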
arXiv:2409.11624 (https://arxiv.org/abs/2409.11624) [pdf, other]
Subjects: cs.CV, cs.LG
Title: Multimodal Generalized Category Discovery
Authors: Yuchang Su, Renping Zhou, Siyu Huang, Xingjian Li, Tianyang Wang, Ziyue Wang, Min Xu
Abstract: Generalized Category Discovery (GCD) aims to classify inputs into both known and novel categories, a task crucial for open-world scientific discoveries. However, current GCD methods are limited to unimodal data, overlooking the inherently multimodal nature of most real-world data. In this work, we extend GCD to a multimodal setting, where inputs from different modalities provide richer and complementary information. Through theoretical analysis and empirical validation, we identify that the key challenge in multimodal GCD lies in effectively aligning heterogeneous information across modalities. To address this, we propose MM-GCD, a novel framework that aligns both the feature and output spaces of different modalities using contrastive learning and distillation techniques. MM-GCD achieves new state-of-the-art performance on the UPMC-Food101 and N24News datasets, surpassing previous methods by 11.5% and 4.7%, respectively.
Submitted 17 September, 2024; originally announced September 2024.
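The cross-modal feature alignment referred to above can be sketched with a symmetric InfoNCE-style contrastive loss that pulls paired image/text embeddings together and pushes unpaired ones apart. This is an illustrative sketch, not the MM-GCD code; projection heads, distillation, and the clustering head are omitted, and the temperature is an assumed value.

# Symmetric cross-modal contrastive loss (illustrative).
import torch
import torch.nn.functional as F

def cross_modal_contrastive_loss(img_emb, txt_emb, temperature=0.07):
    """img_emb, txt_emb: (batch, dim) embeddings of paired samples."""
    img = F.normalize(img_emb, dim=1)
    txt = F.normalize(txt_emb, dim=1)
    logits = img @ txt.t() / temperature                  # (batch, batch) similarities
    targets = torch.arange(img.size(0), device=img.device)
    # Symmetric loss: image-to-text and text-to-image directions.
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

# Toy usage with loosely paired random embeddings.
torch.manual_seed(0)
img = torch.randn(8, 128)
txt = img + 0.1 * torch.randn(8, 128)
print(cross_modal_contrastive_loss(img, txt).item())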
arXiv:2409.10966 (https://arxiv.org/abs/2409.10966) [pdf, other]
Subjects: eess.IV, cs.CV
Title: CUNSB-RFIE: Context-aware Unpaired Neural Schrödinger Bridge in Retinal Fundus Image Enhancement
Authors: Xuanzhao Dong, Vamsi Krishna Vasa, Wenhui Zhu, Peijie Qiu, Xiwen Chen, Yi Su, Yujian Xiong, Zhangsihao Yang, Yanxi Chen, Yalin Wang
Abstract: Retinal fundus photography is significant in diagnosing and monitoring retinal diseases. However, systemic imperfections and operator/patient-related factors can hinder the acquisition of high-quality retinal images. Previous efforts in retinal image enhancement primarily relied on GANs, which are limited by the trade-off between training stability and output diversity. In contrast, the Schrödinger Bridge (SB) offers a more stable solution by utilizing Optimal Transport (OT) theory to model a stochastic differential equation (SDE) between two arbitrary distributions. This allows SB to effectively transform low-quality retinal images into their high-quality counterparts. In this work, we leverage the SB framework to propose an image-to-image translation pipeline for retinal image enhancement. Additionally, previous methods often fail to capture fine structural details, such as blood vessels. To address this, we enhance our pipeline by introducing Dynamic Snake Convolution, whose tortuous receptive field can better preserve tubular structures. We name the resulting retinal fundus image enhancement framework the Context-aware Unpaired Neural Schrödinger Bridge (CUNSB-RFIE). To the best of our knowledge, this is the first endeavor to use the SB approach for retinal image enhancement. Experimental results on a large-scale dataset demonstrate the advantage of the proposed method compared to several state-of-the-art supervised and unsupervised methods in terms of image quality and performance on downstream tasks. The code is available at https://github.com/Retinal-Research/CUNSB-RFIE.
Submitted 17 September, 2024; originally announced September 2024.
arXiv:2409.10252 (https://arxiv.org/abs/2409.10252) [pdf, other]
Subjects: cs.SE
Title: eWAPA: An eBPF-based WASI Performance Analysis Framework for WebAssembly Runtimes
Authors: Chenxi Mao, Yuxin Su, Shiwen Shan, Dan Li
Abstract: WebAssembly (Wasm) is a low-level bytecode format that can run in modern browsers. With the development of standalone runtimes and the improvement of the WebAssembly System Interface (WASI), Wasm has further provided a more complete sandboxed runtime experience for server-side applications, effectively expanding its application scenarios. However, the implementation of WASI varies across runtimes, and suboptimal interface implementations can lead to performance degradation during interactions between the runtime and the operating system. Existing research mainly focuses on overall performance evaluation of runtimes, while studies on WASI implementations are relatively scarce. To tackle this problem, we propose an eBPF-based WASI performance analysis framework. It collects key performance metrics of the runtime under different I/O load conditions, such as total execution time, startup time, WASI execution time, and syscall time, enabling a comprehensive analysis of the runtime's I/O interactions with the operating system. Additionally, we provide a detailed analysis of the causes behind two specific WASI performance anomalies. These analytical results will guide the optimization of standalone runtimes and WASI implementations, enhancing their efficiency.
Submitted 16 September, 2024; originally announced September 2024.
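The framework itself collects its metrics with eBPF probes; as a far simpler stand-in for just one of those metrics (total execution time under a given I/O load), the sketch below times repeated invocations of a standalone runtime from Python. The runtime binary, module path, and command-line flags are placeholders, not eWAPA's setup.

# Coarse wall-clock timing of a standalone Wasm runtime run (illustrative stand-in).
import statistics
import subprocess
import time

RUNTIME = "wasmtime"              # example standalone Wasm runtime
MODULE = "io_benchmark.wasm"      # placeholder WASI module exercising file I/O
WORKDIR = "."                     # directory preopened for WASI file access

def time_runs(repeats=5):
    """Return per-run wall-clock times (seconds) for the runtime invocation."""
    times = []
    for _ in range(repeats):
        start = time.perf_counter()
        subprocess.run([RUNTIME, "--dir", WORKDIR, MODULE],
                       check=True, capture_output=True)
        times.append(time.perf_counter() - start)
    return times

if __name__ == "__main__":
    runs = time_runs()
    print(f"mean {statistics.mean(runs):.3f}s  stdev {statistics.stdev(runs):.3f}s")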
arXiv:2409.06702 (https://arxiv.org/abs/2409.06702) [pdf, other]
Subjects: cs.CV, cs.AI
Title: Hint-AD: Holistically Aligned Interpretability in End-to-End Autonomous Driving
Authors: Kairui Ding, Boyuan Chen, Yuchen Su, Huan-ang Gao, Bu Jin, Chonghao Sima, Wuqiang Zhang, Xiaohui Li, Paul Barsch, Hongyang Li, Hao Zhao
Abstract: End-to-end architectures in autonomous driving (AD) face a significant challenge in interpretability, impeding human-AI trust. Human-friendly natural language has been explored for tasks such as driving explanation and 3D captioning. However, previous works primarily focused on the paradigm of declarative interpretability, where the natural language interpretations are not grounded in the intermediate outputs of AD systems, making the interpretations only declarative. In contrast, aligned interpretability establishes a connection between language and the intermediate outputs of AD systems. Here we introduce Hint-AD, an integrated AD-language system that generates language aligned with the holistic perception-prediction-planning outputs of the AD model. By incorporating the intermediate outputs and a holistic token mixer sub-network for effective feature adaptation, Hint-AD achieves desirable accuracy, attaining state-of-the-art results in driving language tasks including driving explanation, 3D dense captioning, and command prediction. To facilitate further study of the driving explanation task on nuScenes, we also introduce a human-labeled dataset, Nu-X. Codes, dataset, and models will be publicly available.
Submitted 10 September, 2024; originally announced September 2024.
Comments: CoRL 2024, Project Page: https://air-discover.github.io/Hint-AD/
arXiv:2409.04937 (https://arxiv.org/abs/2409.04937) [pdf, other]
Subjects: cs.SE
Title: CONNECTOR: Enhancing the Traceability of Decentralized Bridge Applications via Automatic Cross-chain Transaction Association
Authors: Dan Lin, Jiajing Wu, Yuxin Su, Ziye Zheng, Yuhong Nan, Zibin Zheng
Abstract: Decentralized bridge applications are important software that connects various blockchains and facilitates cross-chain asset transfer in the decentralized finance (DeFi) ecosystem, which currently operates in a multi-chain environment. Cross-chain transaction association identifies and matches unique transactions executed by bridge DApps, and is important for enhancing the traceability of cross-chain bridge DApps. However, existing methods rely entirely on unobservable internal ledgers or APIs, violating the open and decentralized properties of blockchain. In this paper, we analyze the challenges of this issue and then present CONNECTOR, an automated cross-chain transaction association analysis method based on bridge smart contracts. Specifically, CONNECTOR first identifies deposit transactions by extracting distinctive and generic features from the transaction traces of bridge contracts. With the accurate deposit transactions, CONNECTOR mines the execution logs of bridge contracts to achieve withdrawal transaction matching. We conduct real-world experiments on different types of bridges to demonstrate the effectiveness of CONNECTOR. The experiments show that CONNECTOR successfully identifies 100% of deposit transactions, associates 95.81% of withdrawal transactions, and surpasses methods designed for CeFi bridges. Based on the association results, we obtain interesting findings about cross-chain transaction behaviors in DeFi bridges and analyze the tracing abilities of CONNECTOR to assist DeFi bridge apps.
Submitted 7 September, 2024; originally announced September 2024.
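To make the association step concrete, here is an illustrative heuristic, not CONNECTOR's actual feature extraction or log mining: pair a withdrawal on the destination chain with a prior deposit on the source chain that moves the same token and amount to the same recipient within a time window. The field names are assumptions about how decoded bridge events might be represented.

# Greedy deposit/withdrawal association across chains (illustrative heuristic).
from datetime import datetime, timedelta

def associate(deposits, withdrawals, window=timedelta(hours=2)):
    """Match each withdrawal to the earliest compatible, unused deposit."""
    matches, used = [], set()
    for w in withdrawals:
        for i, d in enumerate(deposits):
            if i in used:
                continue
            if (d["token"] == w["token"] and d["amount"] == w["amount"]
                    and d["recipient"] == w["recipient"]
                    and timedelta(0) <= w["time"] - d["time"] <= window):
                matches.append((d["tx_hash"], w["tx_hash"]))
                used.add(i)
                break
    return matches

deposits = [{"tx_hash": "0xaaa", "token": "USDC", "amount": 500,
             "recipient": "0xabc", "time": datetime(2024, 9, 1, 10, 0)}]
withdrawals = [{"tx_hash": "0xbbb", "token": "USDC", "amount": 500,
                "recipient": "0xabc", "time": datetime(2024, 9, 1, 10, 20)}]
print(associate(deposits, withdrawals))   # [('0xaaa', '0xbbb')]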
arXiv:2409.02813 (https://arxiv.org/abs/2409.02813) [pdf, other]
Subjects: cs.CL, cs.CV
Title: MMMU-Pro: A More Robust Multi-discipline Multimodal Understanding Benchmark
Authors: Xiang Yue, Tianyu Zheng, Yuansheng Ni, Yubo Wang, Kai Zhang, Shengbang Tong, Yuxuan Sun, Botao Yu, Ge Zhang, Huan Sun, Yu Su, Wenhu Chen, Graham Neubig
Abstract: This paper introduces MMMU-Pro, a robust version of the Massive Multi-discipline Multimodal Understanding and Reasoning (MMMU) benchmark. MMMU-Pro rigorously assesses multimodal models' true understanding and reasoning capabilities through a three-step process based on MMMU: (1) filtering out questions answerable by text-only models, (2) augmenting candidate options, and (3) introducing a vision-only input setting where questions are embedded within images. This setting challenges AI to truly "see" and "read" simultaneously, testing a fundamental human cognitive skill of seamlessly integrating visual and textual information. Results show that model performance is substantially lower on MMMU-Pro than on MMMU, ranging from 16.8% to 26.9% across models. We explore the impact of OCR prompts and Chain of Thought (CoT) reasoning, finding that OCR prompts have minimal effect while CoT generally improves performance. MMMU-Pro provides a more rigorous evaluation tool, closely mimicking real-world scenarios and offering valuable directions for future research in multimodal AI.
Submitted 10 September, 2024; v1 submitted 4 September, 2024; originally announced September 2024.
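Step (1) of the construction process above can be sketched as follows, purely as an illustration: drop any question a text-only model answers correctly in a majority of attempts, since such questions do not actually require the image. The ask_text_only_model function is a hypothetical placeholder (a random guesser here), not the benchmark's actual filtering pipeline or thresholds.

# Filter out questions answerable without the image (illustrative).
import random

def ask_text_only_model(question, options):
    """Placeholder: a real implementation would call a text-only LLM here."""
    return random.choice(list(options))

def needs_image(item, attempts=5, threshold=0.5):
    """True if the text-only model fails often enough that vision seems required."""
    correct = sum(
        ask_text_only_model(item["question"], item["options"]) == item["answer"]
        for _ in range(attempts)
    )
    return correct / attempts < threshold

questions = [
    {"question": "What trend does the chart show?", "options": "ABCD", "answer": "B"},
    {"question": "2 + 2 = ?", "options": "ABCD", "answer": "A"},
]
filtered = [q for q in questions if needs_image(q)]
print(len(filtered), "questions kept")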
arXiv:2409.01559 (https://arxiv.org/abs/2409.01559) [pdf, other]
Subjects: cs.RO
Title: PR2: A Physics- and Photo-realistic Testbed for Embodied AI and Humanoid Robots
Authors: Hangxin Liu, Qi Xie, Zeyu Zhang, Tao Yuan, Xiaokun Leng, Lining Sun, Song-Chun Zhu, Jingwen Zhang, Zhicheng He, Yao Su
Abstract: This paper presents the development of a physics-realistic and photo-realistic humanoid robot testbed, PR2, to facilitate collaborative research between Embodied Artificial Intelligence (Embodied AI) and robotics. PR2 offers high-quality scene rendering and robot dynamic simulation, enabling (i) the creation of diverse scenes using various digital assets, (ii) the integration of advanced perception or foundation models, and (iii) the implementation of planning and control algorithms for dynamic humanoid robot behaviors based on environmental feedback. The beta version of PR2 has been deployed for the simulation track of a nationwide full-size humanoid robot competition for college students, attracting 137 teams and over 400 participants within four months. This competition covered traditional tasks in bipedal walking, as well as novel challenges in loco-manipulation and language-instruction-based object search, marking a first for public college robotics competitions. A retrospective analysis of the competition suggests that future events should emphasize the integration of locomotion with manipulation and perception. By making the PR2 testbed publicly available at https://github.com/pr2-humanoid/PR2-Platform, we aim to further advance education and training in humanoid robotics.
Submitted 2 September, 2024; originally announced September 2024.