Search | arXiv e-print repository

Showing 1–50 of 269 results for author: Deng, H
Searching in archive cs.

1. arXiv:2502.10273 [pdf, other] cs.CV cs.AI
Probing Perceptual Constancy in Large Vision Language Models
Authors: Haoran Sun, Suyang Yu, Yijiang Li, Qingying Gao, Haiyun Lyu, Hokin Deng, Dezhi Luo
Abstract: Perceptual constancy is the ability to maintain stable perceptions of objects despite changes in sensory input, such as variations in distance, angle, or lighting. This ability is crucial for recognizing visual information in a dynamic world, making it essential for Vision-Language Models (VLMs). However, whether VLMs are currently and theoretically capable of mastering this ability remains underexplored. In this study, we evaluated 33 VLMs using 253 experiments across three domains: color, size, and shape constancy. The experiments included single-image and video adaptations of classic cognitive tasks, along with novel tasks in in-the-wild conditions, to evaluate the models' recognition of object properties under varying conditions. We found significant variability in VLM performance, with models' performance in shape constancy clearly dissociated from that of color and size constancy.
Submitted 14 February, 2025; originally announced February 2025.

2. arXiv:2502.08353 [pdf, other] cs.LG cs.AI
Trustworthy GNNs with LLMs: A Systematic Review and Taxonomy
Authors: Ruizhan Xue, Huimin Deng, Fang He, Maojun Wang, Zeyu Zhang
Abstract: With the extensive application of Graph Neural Networks (GNNs) across various domains, their trustworthiness has emerged as a focal point of research. Some existing studies have shown that the integration of large language models (LLMs) can improve the semantic understanding and generation capabilities of GNNs, which in turn improves the trustworthiness of GNNs from various aspects. Our review introduces a taxonomy that offers researchers a clear framework for comprehending the principles and applications of different methods and helps clarify the connections and differences among various approaches. Then we systematically survey representative approaches along the four categories of our taxonomy. Through our taxonomy, researchers can understand the applicable scenarios, potential advantages, and limitations of each approach for the trusted integration of GNNs with LLMs. Finally, we present some promising directions of work and future trends for the integration of LLMs and GNNs to improve model trustworthiness.
Submitted 12 February, 2025; originally announced February 2025.
Comments: Submitted to IJCAI 2025

3. arXiv:2502.07287 [pdf, other] cs.CY
Diverse Perspectives on AI: Examining People's Acceptability and Reasoning of Possible AI Use Cases
Authors: Jimin Mun, Wei Bin Au Yeong, Wesley Hanwen Deng, Jana Schaich Borg, Maarten Sap
Abstract: In recent years, there has been a growing recognition of the need to incorporate lay-people's input into the governance and acceptability assessment of AI usage. However, how and why people judge different AI use cases to be acceptable or unacceptable remains under-explored. In this work, we investigate the attitudes and reasons that influence people's judgments about AI's development via a survey administered to demographically diverse participants (N=197). We focus on ten distinct professional (e.g., Lawyer AI) and personal (e.g., Digital Medical Advice AI) AI use cases to understand how characteristics of the use cases and the participants' demographics affect acceptability. We explore the relationships between participants' judgments and their rationales such as reasoning approaches (cost-benefit reasoning vs. rule-based). Our empirical findings reveal a number of factors that influence acceptance, such as more negative acceptance of, and higher disagreement over, professional usage compared to personal usage; significant influence of demographic factors such as gender, employment, and education as well as AI literacy level; and reasoning patterns such as rule-based reasoning being used more when a use case is unacceptable. Based on these findings, we discuss the key implications for soliciting acceptability and reasoning of AI use cases to collaboratively build consensus. Finally, we shed light on how future FAccT researchers and practitioners can better incorporate diverse perspectives from lay people to better develop AI that aligns with public expectations and needs.
Submitted 11 February, 2025; originally announced February 2025.
Comments: 34 pages, 35 tables, 9 figures

4. arXiv:2502.07282 [pdf, other] cs.RO
Leader-follower formation enabled by pressure sensing in free-swimming undulatory robotic fish
Authors: Kundan Panta, Hankun Deng, Micah DeLattre, Bo Cheng
Abstract: Fish use their lateral lines to sense flows and pressure gradients, enabling them to detect nearby objects and organisms. Towards replicating this capability, we demonstrated successful leader-follower formation swimming using flow pressure sensing in our undulatory robotic fish ($\mu$Bot/MUBot). The follower $\mu$Bot is equipped at its head with bilateral pressure sensors to detect signals excited by both its own and the leader's movements. First, using experiments with static formations between an undulating leader and a stationary follower, we determined the formation that resulted in strong pressure variations measured by the follower. This formation was then selected as the desired formation in free swimming for obtaining an expert policy. Next, a long short-term memory neural network was used as the control policy that maps the pressure signals along with the robot motor commands and the Euler angles (measured by the onboard IMU) to the steering command. The policy was trained to imitate the expert policy using behavior cloning and Dataset Aggregation (DAgger). The results show that with merely two bilateral pressure sensors and less than one hour of training data, the follower effectively tracked the leader within distances of up to 200 mm (= 1 body length) while swimming at speeds of 155 mm/s (= 0.8 body lengths/s). This work highlights the potential of fish-inspired robots to effectively navigate fluid environments and achieve formation swimming through the use of flow pressure feedback.
Submitted 11 February, 2025; originally announced February 2025.
Comments: 7 pages, 10 figures. Accepted for 2025 IEEE International Conference on Robotics and Automation (ICRA). Supplementary video: https://youtu.be/DIDYGi9Td0I

5. arXiv:2502.06788 [pdf, other] cs.CV cs.AI
EVEv2: Improved Baselines for Encoder-Free Vision-Language Models
Authors: Haiwen Diao, Xiaotong Li, Yufeng Cui, Yueze Wang, Haoge Deng, Ting Pan, Wenxuan Wang, Huchuan Lu, Xinlong Wang
Abstract: Existing encoder-free vision-language models (VLMs) are rapidly narrowing the performance gap with their encoder-based counterparts, highlighting the promising potential for unified multimodal systems with structural simplicity and efficient deployment. We systematically clarify the performance gap between VLMs using pre-trained vision encoders, discrete tokenizers, and minimalist visual layers from scratch, deeply excavating the under-examined characteristics of encoder-free VLMs. We develop efficient strategies for encoder-free VLMs that rival mainstream encoder-based ones. After an in-depth investigation, we launch EVEv2.0, a new and improved family of encoder-free VLMs. We show that: (i) Properly decomposing and hierarchically associating vision and language within a unified model reduces interference between modalities. (ii) A well-designed training strategy enables effective optimization for encoder-free VLMs. Through extensive evaluation, our EVEv2.0 represents a thorough study for developing a decoder-only architecture across modalities, demonstrating superior data efficiency and strong vision-reasoning capability. Code is publicly available at: https://github.com/baaivision/EVE
Submitted 10 February, 2025; originally announced February 2025.
Comments: 19 pages, 9 figures

6. arXiv:2502.01885 [pdf] cs.LG cs.AI eess.IV
A Privacy-Preserving Domain Adversarial Federated learning for multi-site brain functional connectivity analysis
Authors: Yipu Zhang, Likai Wang, Kuan-Jui Su, Aiying Zhang, Hao Zhu, Xiaowen Liu, Hui Shen, Vince D. Calhoun, Yuping Wang, Hongwen Deng
Abstract: Resting-state functional magnetic resonance imaging (rs-fMRI) and its derived functional connectivity networks (FCNs) have become critical for understanding neurological disorders. However, collaborative analyses and the generalizability of models still face significant challenges due to privacy regulations and the non-IID (non-independent and identically distributed) property of multiple data sources. To mitigate these difficulties, we propose Domain Adversarial Federated Learning (DAFed), a novel federated deep learning framework specifically designed for non-IID fMRI data analysis in multi-site settings. DAFed addresses these challenges through feature disentanglement, decomposing the latent feature space into domain-invariant and domain-specific components, to ensure robust global learning while preserving local data specificity. Furthermore, adversarial training facilitates effective knowledge transfer between labeled and unlabeled datasets, while a contrastive learning module enhances the global representation of domain-invariant features. We evaluated DAFed on the diagnosis of ASD and further validated its generalizability in the classification of AD, demonstrating its superior classification accuracy compared to state-of-the-art methods. Additionally, an enhanced Score-CAM module identifies key brain regions and functional connectivity significantly associated with ASD and MCI, respectively, uncovering shared neurobiological patterns across sites. These findings highlight the potential of DAFed to advance multi-site collaborative research in neuroimaging while protecting data confidentiality.
Submitted 3 February, 2025; originally announced February 2025.
Comments: 34 pages, 13 figures

7. arXiv:2501.10383 [pdf, other] cs.CY cs.HC
The Generative AI Ethics Playbook
Authors: Jessie J. Smith, Wesley Hanwen Deng, William H. Smith, Maarten Sap, Nicole DeCario, Jesse Dodge
Abstract: The Generative AI Ethics Playbook provides guidance for identifying and mitigating risks of machine learning systems across various domains, including natural language processing, computer vision, and generative AI. This playbook aims to assist practitioners in diagnosing potential harms that may arise during the design, development, and deployment of datasets and models. It offers concrete strategies and resources for mitigating these risks, to help minimize negative impacts on users and society. Drawing on current best practices in both research and ethical considerations, this playbook aims to serve as a comprehensive resource for AI/ML practitioners. The intended audience of this playbook includes machine learning researchers, engineers, and practitioners who are involved in the creation and implementation of generative and multimodal models (e.g., text-to-text, image-to-image, text-to-image, text-to-video). Specifically, we provide transparency/documentation checklists, topics of interest, common questions, examples of harms through case studies, and resources and strategies to mitigate harms throughout the Generative AI lifecycle. This playbook was made collaboratively over the course of 16 months through extensive literature review of over 100 resources and peer-reviewed articles, as well as through an initial group brainstorming session with 18 interdisciplinary AI ethics experts from industry and academia, and with additional feedback from 8 experts (5 of whom were in the initial brainstorming session). We note that while this playbook provides examples, discussion, and harm mitigation strategies, research in this area is ongoing. Our playbook aims to be a practically useful survey, taking a high-level view rather than aiming to cover the entire existing body of research.
Submitted 17 December, 2024; originally announced January 2025.

8. arXiv:2501.01397 [pdf, other] cs.HC
WeAudit: Scaffolding User Auditors and AI Practitioners in Auditing Generative AI
Authors: Wesley Hanwen Deng, Wang Claire, Howard Ziyu Han, Jason I. Hong, Kenneth Holstein, Motahhare Eslami
Abstract: There has been growing interest from both practitioners and researchers in engaging end users in AI auditing, to draw upon users' unique knowledge and lived experiences. However, we know little about how to effectively scaffold end users in auditing in ways that can generate actionable insights for AI practitioners. Through formative studies with both users and AI practitioners, we first identified a set of design goals to support user-engaged AI auditing. We then developed WeAudit, a workflow and system that supports end users in auditing AI both individually and collectively. We evaluated WeAudit through a three-week user study with user auditors and interviews with industry Generative AI practitioners. Our findings offer insights into how WeAudit supports users in noticing and reflecting upon potential AI harms and in articulating their findings in ways that industry practitioners can act upon. Based on our observations and feedback from both users and practitioners, we identify several opportunities to better support user engagement in AI auditing processes. We discuss implications for future research to support effective and responsible user engagement in AI auditing and red-teaming.
Submitted 9 January, 2025; v1 submitted 2 January, 2025; originally announced January 2025.

9. arXiv:2412.19991 [pdf, other] cs.LG cs.DC
A Robust Federated Learning Framework for Undependable Devices at Scale
Authors: Shilong Wang, Jianchun Liu, Hongli Xu, Chunming Qiao, Huarong Deng, Qiuye Zheng, Jiantao Gong
Abstract: In a federated learning (FL) system, many devices, such as smartphones, are often undependable (e.g., frequently disconnected from WiFi) during training. Existing FL frameworks always assume a dependable environment and exclude undependable devices from training, leading to poor model performance and resource wastage. In this paper, we propose FLUDE to effectively deal with undependable environments. First, FLUDE assesses the dependability of devices based on the probability distribution of their historical behaviors (e.g., the likelihood of successfully completing training). Based on this assessment, FLUDE adaptively selects devices with high dependability for training. To mitigate resource wastage during the training phase, FLUDE maintains a model cache on each device, aiming to preserve the latest training state for later use in case local training on an undependable device is interrupted. Moreover, FLUDE proposes a staleness-aware strategy to judiciously distribute the global model to a subset of devices, thus significantly reducing resource wastage while maintaining model performance. We have implemented FLUDE on two physical platforms with 120 smartphones and NVIDIA Jetson devices. Extensive experimental results demonstrate that FLUDE can effectively improve model performance and resource efficiency of FL training in undependable environments.
Submitted 27 December, 2024; originally announced December 2024.

10. arXiv:2412.14169 [pdf, other] cs.CV
Autoregressive Video Generation without Vector Quantization
Authors: Haoge Deng, Ting Pan, Haiwen Diao, Zhengxiong Luo, Yufeng Cui, Huchuan Lu, Shiguang Shan, Yonggang Qi, Xinlong Wang
Abstract: This paper presents a novel approach that enables autoregressive video generation with high efficiency. We propose to reformulate the video generation problem as a non-quantized autoregressive modeling of temporal frame-by-frame prediction and spatial set-by-set prediction. Unlike raster-scan prediction in prior autoregressive models or joint distribution modeling of fixed-length tokens in diffusion models, our approach maintains the causal property of GPT-style models for flexible in-context capabilities, while leveraging bidirectional modeling within individual frames for efficiency. With the proposed approach, we train a novel video autoregressive model without vector quantization, termed NOVA. Our results demonstrate that NOVA surpasses prior autoregressive video models in data efficiency, inference speed, visual fidelity, and video fluency, even with a much smaller model capacity, i.e., 0.6B parameters. NOVA also outperforms state-of-the-art image diffusion models in text-to-image generation tasks, with a significantly lower training cost. Additionally, NOVA generalizes well across extended video durations and enables diverse zero-shot applications in one unified model. Code and models are publicly available at https://github.com/baaivision/NOVA
Submitted 18 December, 2024; originally announced December 2024.
Comments: 22 pages, 16 figures

arXiv:2412.12223  [pdf, other]  cs.CV cs.AI
Can video generation replace cinematographers? Research on the cinematic language of generated video
Authors: Xiaozhe Li, Kai WU, Siyi Yang, YiZhan Qu, Guohua Zhang, Zhiyu Chen, Jiayao Li, Jiangchuan Mu, Xiaobin Hu, Wen Fang, Mingliang Xiong, Hao Deng, Qingwen Liu, Gang Li, Bin He
Abstract: Recent advancements in text-to-video (T2V) generation have leveraged diffusion models to enhance the visual coherence of videos generated from textual descriptions. However, most research has primarily focused on object motion, with limited attention given to cinematic language in videos, which is crucial for cinematographers to convey emotion and narrative pacing. To address this limitation, we propose a threefold approach to enhance the ability of T2V models to generate controllable cinematic language. Specifically, we introduce a cinematic language dataset that encompasses shot framing, angle, and camera movement, enabling models to learn diverse cinematic styles. Building on this, to facilitate robust cinematic alignment evaluation, we present CameraCLIP, a model fine-tuned on the proposed dataset that excels in understanding complex cinematic language in generated videos and can further provide valuable guidance in the multi-shot composition process. Finally, we propose CLIPLoRA, a cost-guided dynamic LoRA composition method that facilitates smooth transitions and realistic blending of cinematic language by dynamically fusing multiple pre-trained cinematic LoRAs within a single video. Our experiments demonstrate that CameraCLIP outperforms existing models in assessing the alignment between cinematic language and video, achieving an R@1 score of 0.81. Additionally, CLIPLoRA improves the ability for multi-shot composition, potentially bridging the gap between automatically generated videos and those shot by professional cinematographers.
Submitted 16 December, 2024; originally announced December 2024.
Comments: 13 pages

arXiv:2412.07803  [pdf, other]  quant-ph cs.ET cs.PF
Pattern Tree: Enhancing Efficiency in Quantum Circuit Optimization Based on Pattern-matching
Authors: Mingyu Chen, Yu Zhang, Zhaoyu Zheng, Yongshang Li, Haoning Deng
Abstract: Quantum circuit optimization is essential for improving the performance of quantum algorithms, particularly on Noisy Intermediate-Scale Quantum (NISQ) devices with limited qubit connectivity and high error rates. Pattern matching has proven to be an effective technique for identifying and optimizing subcircuits by replacing them with functionally equivalent, efficient versions, including reducing circuit depth and facilitating platform portability. However, existing approaches face challenges in handling large-scale circuits and numerous transformation rules, often leading to redundant matches and increased compilation time. In this study, we propose a novel framework for quantum circuit optimization based on pattern matching to enhance its efficiency. Observing redundancy in applying existing transformation rules, our method employs a pattern tree structure to organize these rules, reducing redundant operations during the execution of the pattern-matching algorithm and improving matching efficiency. We design and implement a compilation framework to demonstrate the practicality of the pattern tree approach. Experimental results show that pattern-tree-based pattern matching can reduce execution time by an average of 20% on a well-accepted benchmark set. Furthermore, we analyze how to build a pattern tree that maximizes the reduction in compilation time; the evaluation results demonstrate that our approach has the potential to reduce compilation time by 90%.
Submitted 9 December, 2024; originally announced December 2024.
Comments: 18 pages, 9 figures
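
To make the "organize rules in a tree so shared structure is matched once" idea concrete, here is a deliberately simplified sketch: transformation patterns are stored in a prefix trie over gate-name sequences and matched against a linear gate list in one scan. Real circuit patterns are subgraphs of a gate DAG and the paper's tree construction is more involved, so this is only an illustration; the rule set and circuit below are invented examples.

```python
# Toy pattern tree: rules sharing a prefix (e.g. "H,H" and "H,Z,H") are walked together
# instead of being matched one rule at a time.
def build_pattern_tree(rules):
    """rules: dict mapping a gate-name sequence (tuple) to a replacement label."""
    tree = {}
    for pattern, replacement in rules.items():
        node = tree
        for gate in pattern:
            node = node.setdefault(gate, {})
        node["__rule__"] = replacement
    return tree

def match(circuit, tree):
    """Scan a linear gate list once, following the shared tree instead of each rule separately."""
    hits = []
    for start in range(len(circuit)):
        node = tree
        for pos in range(start, len(circuit)):
            node = node.get(circuit[pos])
            if node is None:
                break
            if "__rule__" in node:
                hits.append((start, pos, node["__rule__"]))
    return hits

rules = {
    ("H", "H"): "drop",       # two Hadamards cancel
    ("H", "Z", "H"): "X",     # HZH = X
    ("CX", "CX"): "drop",     # repeated CNOT cancels
}
circuit = ["H", "Z", "H", "CX", "CX", "H", "H"]
print(match(circuit, build_pattern_tree(rules)))
```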

arXiv:2412.07230  [pdf, other]  cs.CV
Deep Non-rigid Structure-from-Motion Revisited: Canonicalization and Sequence Modeling
Authors: Hui Deng, Jiawei Shi, Zhen Qin, Yiran Zhong, Yuchao Dai
Abstract: Non-Rigid Structure-from-Motion (NRSfM) is a classic 3D vision problem, where a 2D sequence is taken as input to estimate the corresponding 3D sequence. Recently, deep neural networks have greatly advanced the task of NRSfM. However, existing deep NRSfM methods still have limitations in handling the inherent sequence property and motion ambiguity associated with the NRSfM problem. In this paper, we revisit deep NRSfM from two perspectives to address these limitations: (1) canonicalization and (2) sequence modeling. We propose an easy-to-implement per-sequence canonicalization method, as opposed to the previous per-dataset canonicalization approaches. With this in mind, we propose a sequence modeling method that combines temporal information and a subspace constraint. As a result, we achieve an improved NRSfM reconstruction pipeline compared to previous efforts. The effectiveness of our method is verified by testing the sequence-to-sequence deep NRSfM pipeline with the corresponding regularization modules on several commonly used datasets.
Submitted 10 December, 2024; originally announced December 2024.
Comments: 9 pages main text, 7 pages appendix

arXiv:2412.06779  [pdf, other]  cs.RO cs.AI
AnyBimanual: Transferring Unimanual Policy for General Bimanual Manipulation
Authors: Guanxing Lu, Tengbo Yu, Haoyuan Deng, Season Si Chen, Yansong Tang, Ziwei Wang
Abstract: Performing general language-conditioned bimanual manipulation tasks is of great importance for many applications, ranging from household service to industrial assembly. However, collecting bimanual manipulation data is expensive due to the high-dimensional action space, which poses challenges for conventional methods in handling general bimanual manipulation tasks. In contrast, unimanual policies have recently demonstrated impressive generalizability across a wide range of tasks because of scaled model parameters and training data, and can provide sharable manipulation knowledge for bimanual systems. To this end, we propose a plug-and-play method named AnyBimanual, which transfers a pre-trained unimanual policy to a general bimanual manipulation policy with few bimanual demonstrations. Specifically, we first introduce a skill manager to dynamically schedule the skill representations discovered from the pre-trained unimanual policy for bimanual manipulation tasks, which linearly combines skill primitives with task-oriented compensation to represent the bimanual manipulation instruction. To mitigate the observation discrepancy between unimanual and bimanual systems, we present a visual aligner that generates soft masks for the visual embedding of the workspace, aiming to align the visual input of the unimanual policy model for each arm with that seen during the pretraining stage. AnyBimanual shows superiority on 12 simulated tasks from RLBench2, with a sizable 12.67% improvement in success rate over previous methods. Experiments on 9 real-world tasks further verify its practicality, with an average success rate of 84.62%.
Submitted 9 December, 2024; originally announced December 2024.
Comments: Project page: https://anybimanual.github.io/
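
The "linearly combine skill primitives with task-oriented compensation" step can be pictured with a small toy computation: attention-style weights over a bank of skill embeddings, a weighted sum, and an additive compensation term per arm. The shapes, the softmax weighting, and all names below are illustrative assumptions, not the paper's implementation.

```python
# Toy sketch: weights over skill primitives from an instruction embedding, then a
# linear combination plus a task-oriented compensation vector for each arm.
import numpy as np

rng = np.random.default_rng(0)
K, D = 6, 32                                         # number of skill primitives, embedding dim

skill_primitives = rng.standard_normal((K, D))       # stand-in for skills from a unimanual policy
instruction_embed = rng.standard_normal(D)           # stand-in for a language instruction embedding

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def bimanual_skill(instruction, primitives, compensation):
    """Attention-style weights over primitives, then a linear combination + compensation."""
    weights = softmax(primitives @ instruction / np.sqrt(D))
    return weights @ primitives + compensation

left_arm = bimanual_skill(instruction_embed, skill_primitives, rng.standard_normal(D) * 0.1)
right_arm = bimanual_skill(instruction_embed, skill_primitives, rng.standard_normal(D) * 0.1)
print(left_arm.shape, right_arm.shape)
```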

arXiv:2412.06699  [pdf, other]  cs.CV
You See it, You Got it: Learning 3D Creation on Pose-Free Videos at Scale
Authors: Baorui Ma, Huachen Gao, Haoge Deng, Zhengxiong Luo, Tiejun Huang, Lulu Tang, Xinlong Wang
Abstract: Recent 3D generation models typically rely on limited-scale 3D 'gold labels' or 2D diffusion priors for 3D content creation. However, their performance is upper-bounded by constrained 3D priors due to the lack of scalable learning paradigms. In this work, we present See3D, a visual-conditional multi-view diffusion model trained on large-scale Internet videos for open-world 3D creation. The model aims to Get 3D knowledge by solely Seeing the visual contents from the vast and rapidly growing video data -- You See it, You Got it. To achieve this, we first scale up the training data using a proposed data curation pipeline that automatically filters out multi-view inconsistencies and insufficient observations from source videos. This results in a high-quality, richly diverse, large-scale dataset of multi-view images, termed WebVi3D, containing 320M frames from 16M video clips. Nevertheless, learning generic 3D priors from videos without explicit 3D geometry or camera pose annotations is nontrivial, and annotating poses for web-scale videos is prohibitively expensive. To eliminate the need for pose conditions, we introduce an innovative visual condition: a purely 2D-inductive visual signal generated by adding time-dependent noise to the masked video data. Finally, we introduce a novel visual-conditional 3D generation framework by integrating See3D into a warping-based pipeline for high-fidelity 3D generation. Our numerical and visual comparisons on single and sparse reconstruction benchmarks show that See3D, trained on cost-effective and scalable video data, achieves notable zero-shot and open-world generation capabilities, markedly outperforming models trained on costly and constrained 3D datasets. Please refer to our project page at https://vision.baai.ac.cn/see3d
Submitted 14 December, 2024; v1 submitted 9 December, 2024; originally announced December 2024.
Comments: Project Page: https://vision.baai.ac.cn/see3d

arXiv:2412.05271  [pdf, other]  cs.CV
Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling
Authors: Zhe Chen, Weiyun Wang, Yue Cao, Yangzhou Liu, Zhangwei Gao, Erfei Cui, Jinguo Zhu, Shenglong Ye, Hao Tian, Zhaoyang Liu, Lixin Gu, Xuehui Wang, Qingyun Li, Yimin Ren, Zixuan Chen, Jiapeng Luo, Jiahao Wang, Tan Jiang, Bo Wang, Conghui He, Botian Shi, Xingcheng Zhang, Han Lv, Yi Wang, Wenqi Shao, et al. (17 additional authors not shown)
Abstract: We introduce InternVL 2.5, an advanced multimodal large language model (MLLM) series that builds upon InternVL 2.0, maintaining its core model architecture while introducing significant enhancements in training and testing strategies as well as data quality. In this work, we delve into the relationship between model scaling and performance, systematically exploring the performance trends in vision encoders, language models, dataset sizes, and test-time configurations. Through extensive evaluations on a wide range of benchmarks, including multi-discipline reasoning, document understanding, multi-image/video understanding, real-world comprehension, multimodal hallucination detection, visual grounding, multilingual capabilities, and pure language processing, InternVL 2.5 exhibits competitive performance, rivaling leading commercial models such as GPT-4o and Claude-3.5-Sonnet. Notably, our model is the first open-source MLLM to surpass 70% on the MMMU benchmark, achieving a 3.7-point improvement through Chain-of-Thought (CoT) reasoning and showcasing strong potential for test-time scaling. We hope this model contributes to the open-source community by setting new standards for developing and applying multimodal AI systems. HuggingFace demo: https://huggingface.co/spaces/OpenGVLab/InternVL
Submitted 13 January, 2025; v1 submitted 6 December, 2024; originally announced December 2024.
Comments: Technical Report
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.18990v1-abstract-full').style.display = 'none'; document.getElementById('2411.18990v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 3 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> In Proceedings of the 18th International Workshop on Semantic Evaluation (SemEval-2024), pages 881-887 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14055">arXiv:2411.14055</a> <span> [<a href="https://arxiv.org/pdf/2411.14055">pdf</a>, <a href="https://arxiv.org/format/2411.14055">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> DRPruning: Efficient Large Language Model Pruning through Distributionally Robust Optimization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+H">Hexuan Deng</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+W">Wenxiang Jiao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xuebo Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Min Zhang</a>, <a href="/search/cs?searchtype=author&query=Tu%2C+Z">Zhaopeng Tu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14055v1-abstract-short" style="display: inline;"> Large language models (LLMs) deliver impressive results but face challenges from increasing model sizes and computational costs. Structured pruning reduces model size and speeds up inference but often causes uneven degradation across domains, leading to biased performance. To address this, we propose DRPruning, which incorporates distributionally robust optimization to restore balanced performance… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14055v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14055v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14055v1-abstract-full" style="display: none;"> Large language models (LLMs) deliver impressive results but face challenges from increasing model sizes and computational costs. Structured pruning reduces model size and speeds up inference but often causes uneven degradation across domains, leading to biased performance. To address this, we propose DRPruning, which incorporates distributionally robust optimization to restore balanced performance across domains, along with further improvements to enhance robustness. 

arXiv:2411.14055  [pdf, other]  cs.CL
DRPruning: Efficient Large Language Model Pruning through Distributionally Robust Optimization
Authors: Hexuan Deng, Wenxiang Jiao, Xuebo Liu, Min Zhang, Zhaopeng Tu
Abstract: Large language models (LLMs) deliver impressive results but face challenges from increasing model sizes and computational costs. Structured pruning reduces model size and speeds up inference but often causes uneven degradation across domains, leading to biased performance. To address this, we propose DRPruning, which incorporates distributionally robust optimization to restore balanced performance across domains, along with further improvements to enhance robustness. Experiments in monolingual and multilingual settings show that our method surpasses similarly sized models in pruning and continued pretraining in terms of perplexity, downstream tasks, and instruction tuning. We further provide analysis demonstrating the robustness of our method towards various domains and distribution shifts. Furthermore, our method automatically determines optimal reference losses and data ratios, suggesting potential for broader applications. Our code is available at https://github.com/hexuandeng/DRPruning.
Submitted 21 November, 2024; originally announced November 2024.
Comments: Work in Progress
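
A common distributionally-robust-optimization pattern behind abstracts like this one is to reweight training domains so that domains lagging behind their reference loss get sampled more. The sketch below shows a generic exponentiated-gradient style update of that kind; it is not claimed to be DRPruning's exact rule, and the domain names, losses, reference losses, and step size are made-up placeholders.

```python
# Generic DRO-style domain reweighting: upweight domains whose current loss exceeds
# a reference loss, via a multiplicative (exponentiated-gradient) update.
import numpy as np

def update_domain_weights(weights, losses, reference, eta=0.5):
    excess = np.maximum(losses - reference, 0.0)     # how far each domain lags behind its target
    new_w = weights * np.exp(eta * excess)
    return new_w / new_w.sum()                       # renormalize to a sampling distribution

domains = ["web", "code", "wiki", "multilingual"]
weights = np.full(len(domains), 0.25)
losses = np.array([2.9, 2.1, 2.4, 3.4])              # current per-domain losses (placeholder numbers)
reference = np.array([2.7, 2.2, 2.3, 3.0])           # reference losses (placeholder numbers)

for step in range(3):
    weights = update_domain_weights(weights, losses, reference)
    print(step, dict(zip(domains, np.round(weights, 3))))
```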

arXiv:2411.03713  [pdf, other]  cs.LG
Generalized Trusted Multi-view Classification Framework with Hierarchical Opinion Aggregation
Authors: Long Shi, Chuanqing Tang, Huangyi Deng, Cai Xu, Lei Xing, Badong Chen
Abstract: Recently, multi-view learning has witnessed considerable interest in research on trusted decision-making. Previous methods are mainly inspired by an important paper published by Han et al. in 2021, which formulates a Trusted Multi-view Classification (TMC) framework that aggregates evidence from different views based on Dempster's combination rule. All these methods only consider inter-view aggregation, lacking exploitation of intra-view information. In this paper, we propose a generalized trusted multi-view classification framework with hierarchical opinion aggregation. This hierarchical framework includes a two-phase aggregation process: the intra-view and inter-view aggregation hierarchies. In the intra-view aggregation, we assume that each view comprises common information shared with other views as well as its own specific information, and we aggregate both. This aggregation phase helps eliminate the feature noise inherent to each view, thereby improving the view quality. In the inter-view aggregation, we design an attention mechanism at the evidence level to facilitate opinion aggregation from different views. To the best of our knowledge, this is one of the pioneering efforts to formulate a hierarchical aggregation framework in the trusted multi-view learning domain. Extensive experiments show that our model outperforms several state-of-the-art trust-related baselines.
Submitted 6 November, 2024; originally announced November 2024.
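
For context on the building block this abstract refers to, the sketch below combines two per-view "opinions" (class belief masses plus an uncertainty mass) with the reduced form of Dempster's combination rule commonly presented in the TMC line of work (Han et al., 2021). The input numbers are toy values, and the paper's own hierarchical intra-/inter-view aggregation is not reproduced here.

```python
# Toy combination of two subjective-logic opinions with a reduced Dempster's rule:
# b = (b1*b2 + b1*u2 + b2*u1) / (1 - C), u = u1*u2 / (1 - C),
# where C is the conflicting mass between the two views.
import numpy as np

def dempster_combine(b1, u1, b2, u2):
    conflict = np.sum(np.outer(b1, b2)) - np.sum(b1 * b2)   # mass assigned to disagreeing classes
    scale = 1.0 / (1.0 - conflict)
    b = scale * (b1 * b2 + b1 * u2 + b2 * u1)
    u = scale * (u1 * u2)
    return b, u

# Two views over 3 classes: beliefs plus uncertainty sum to 1 for each view (toy numbers).
b_view1, u_view1 = np.array([0.6, 0.2, 0.1]), 0.1
b_view2, u_view2 = np.array([0.5, 0.1, 0.1]), 0.3

b, u = dempster_combine(b_view1, u_view1, b_view2, u_view2)
print(np.round(b, 3), round(u, 3), round(float(b.sum() + u), 3))   # combined opinion still sums to 1
```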

arXiv:2410.22985  [pdf, ps, other]  cs.HC
Troubling Taxonomies in GenAI Evaluation
Authors: Glen Berman, Ned Cooper, Wesley Hanwen Deng, Ben Hutchinson
Abstract: Evaluating the societal impacts of GenAI requires a model of how social harms emerge from interactions between GenAI, people, and societal structures. Yet a model is rarely explicitly defined in societal impact evaluations, or in the taxonomies of societal impacts that support them. In this provocation, we argue that societal impacts should be conceptualised as application- and context-specific, incommensurable, and shaped by questions of social power. Doing so leads us to conclude that societal impact evaluations using existing taxonomies are inherently limited in their potential to reveal how GenAI systems may interact with people when introduced into specific social contexts. We therefore propose a governance-first approach to managing the societal harms attendant on GenAI technologies.
Submitted 30 October, 2024; originally announced October 2024.
Comments: 3 pages

arXiv:2410.20814  [pdf, other]  cs.CL
NewTerm: Benchmarking Real-Time New Terms for Large Language Models with Annual Updates
Authors: Hexuan Deng, Wenxiang Jiao, Xuebo Liu, Min Zhang, Zhaopeng Tu
Abstract: Despite their remarkable abilities in various tasks, large language models (LLMs) still struggle with real-time information (e.g., new facts and terms) due to the knowledge cutoff in their development process. However, existing benchmarks focus on outdated content and limited fields, facing difficulties in real-time updating and leaving new terms unexplored. To address this problem, we propose an adaptive benchmark, NewTerm, for real-time evaluation of new terms. We design a highly automated construction method to ensure high-quality benchmark construction with minimal human effort, allowing flexible updates for real-time information. Empirical results on various LLMs demonstrate over 20% performance reduction caused by new terms. Additionally, while updates to the knowledge cutoff of LLMs can cover some of the new terms, they are unable to generalize to more distant new terms. We also analyze which types of terms are more challenging and why LLMs struggle with new terms, paving the way for future research. Finally, we construct NewTerm 2022 and 2023 to evaluate the new terms updated each year and will continue updating annually. The benchmark and code can be found at https://github.com/hexuandeng/NewTerm.
Submitted 28 October, 2024; originally announced October 2024.
Comments: Accepted to NeurIPS 2024 Datasets and Benchmarks Track

arXiv:2410.11997  [pdf, other]  cs.CE q-fin.CP
Quantum Computing for Multi Period Asset Allocation
Authors: Queenie Sun, Nicholas Grablevsky, Huaizhang Deng, Pooya Azadi
Abstract: Portfolio construction has been a long-standing topic of research in finance. The computational complexity and the time required both increase rapidly with the number of investments in the portfolio, making such problems difficult, or even impossible, for classical computers to solve. Quantum computing is a new way of computing that takes advantage of quantum superposition and entanglement. It changes how such problems are approached and is not constrained by some of the classical computational complexity. Studies have shown that quantum computing can offer significant advantages over classical computing in many fields. The application of quantum computing has been constrained by the unavailability of actual quantum computers. In the past decade, there has been rapid development of large-scale quantum computers; however, software development for quantum computing remains slow in many fields. In our study, we apply quantum computing to a multi-asset portfolio simulation. The simulation is based on historic data, covariance, and expected returns, all calculated using quantum computing. Although this is technically a solvable problem for classical computing, we believe the software development is important to the future application of quantum computing in finance. We conducted this study through simulation of a quantum computer and the use of Rensselaer Polytechnic Institute's IBM quantum computer.
Submitted 15 October, 2024; originally announced October 2024.
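
As a classical reference point for the ingredients this abstract mentions (historical returns, covariance, expected returns), the sketch below builds a simple single-period mean-variance portfolio from synthetic return data. It is only a baseline illustration of the objective those quantities feed into, not the paper's quantum multi-period formulation; all numbers and the normalization choice are made up.

```python
# Classical mean-variance baseline: estimate mu and Sigma from (synthetic) historical
# returns and solve for weights maximizing mu^T w - (lambda/2) w^T Sigma w.
import numpy as np

rng = np.random.default_rng(0)
n_assets, n_days = 5, 500
daily_returns = rng.normal(loc=0.0005, scale=0.01, size=(n_days, n_assets))

mu = daily_returns.mean(axis=0)                  # expected returns
sigma = np.cov(daily_returns.T)                  # covariance matrix

risk_aversion = 5.0
w = np.linalg.solve(risk_aversion * sigma, mu)   # unconstrained optimum of the quadratic objective
w = w / np.abs(w).sum()                          # crude normalization, purely for illustration

print(np.round(w, 3), float(mu @ w), float(w @ sigma @ w))
```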

arXiv:2410.11046  [pdf]  cs.IR cs.LG q-bio.QM
SGUQ: Staged Graph Convolution Neural Network for Alzheimer's Disease Diagnosis using Multi-Omics Data
Authors: Liang Tao, Yixin Xie, Jeffrey D Deng, Hui Shen, Hong-Wen Deng, Weihua Zhou, Chen Zhao
Abstract: Alzheimer's disease (AD) is a chronic neurodegenerative disorder and the leading cause of dementia, significantly impacting cost, mortality, and burden worldwide. The advent of high-throughput omics technologies, such as genomics, transcriptomics, proteomics, and epigenomics, has revolutionized the molecular understanding of AD. Conventional AI approaches typically require the completion of all omics data at the outset to achieve optimal AD diagnosis, which is inefficient and may be unnecessary. To reduce the clinical cost and improve the accuracy of AD diagnosis using multi-omics data, we propose a novel staged graph convolutional network with uncertainty quantification (SGUQ). SGUQ begins with mRNA and progressively incorporates DNA methylation and miRNA data only when necessary, reducing overall costs and exposure to harmful tests. Experimental results indicate that 46.23% of the samples can be reliably predicted using only single-modal omics data (mRNA), while an additional 16.04% of the samples achieve reliable predictions when combining two omics data types (mRNA + DNA methylation). In addition, the proposed staged SGUQ achieved an accuracy of 0.858 on the ROSMAP dataset, significantly outperforming existing methods. The proposed SGUQ can not only be applied to AD diagnosis using multi-omics data but also has the potential for clinical decision-making using multi-view data. Our implementation is publicly available at https://github.com/chenzhao2023/multiomicsuncertainty.
Submitted 14 October, 2024; originally announced October 2024.
Comments: 20 pages, 2 figures
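
The staged, uncertainty-gated control flow described above (start from mRNA, add further omics layers only when the prediction is still uncertain) can be illustrated with a few lines of Python. The stub classifiers and the entropy-based uncertainty below are placeholders; the paper's graph-convolutional and uncertainty-quantification models, and its actual threshold, are not reproduced.

```python
# Toy staged prediction with uncertainty gating: stop at the cheapest stage whose
# prediction is confident enough, otherwise fall through to richer modalities.
import numpy as np

rng = np.random.default_rng(0)

def entropy(p):
    return float(-(p * np.log(p + 1e-12)).sum())

def stub_classifier(features):
    """Stand-in for a per-stage model; returns probabilities over 2 classes."""
    logit = float(np.tanh(features.mean()))
    p1 = 1.0 / (1.0 + np.exp(-4.0 * logit))
    return np.array([1.0 - p1, p1])

def staged_predict(mrna, methylation, mirna, threshold=0.4):
    stages = [("mRNA", mrna),
              ("mRNA+DNAm", np.concatenate([mrna, methylation])),
              ("mRNA+DNAm+miRNA", np.concatenate([mrna, methylation, mirna]))]
    for name, feats in stages:
        probs = stub_classifier(feats)
        if entropy(probs) < threshold:            # confident enough: stop early, skip further tests
            return name, probs
    return name, probs                            # otherwise return the full-modality prediction

sample = [rng.standard_normal(50), rng.standard_normal(30), rng.standard_normal(20)]
print(staged_predict(*sample))
```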

arXiv:2410.10855  [pdf, other]  cs.CL cs.AI cs.CV
CogDevelop2K: Reversed Cognitive Development in Multimodal Large Language Models
Authors: Yijiang Li, Qingying Gao, Haoran Sun, Haiyun Lyu, Dezhi Luo, Hokin Deng
Abstract: Are Multi-modal Large Language Models (MLLMs) stochastic parrots? Do they genuinely understand? This paper aims to explore the core cognitive abilities that human intelligence builds upon to perceive, comprehend, and reason in MLLMs. To this end, we propose CogDevelop2K, a comprehensive benchmark that spans 12 sub-concepts, from primitive knowledge like object permanence and boundary to more complex abilities like intentionality understanding, structured via the developmental trajectory of the human mind. We evaluate 46 MLLMs on our benchmarks. Surprisingly, we observe a reversed cognitive developmental trajectory compared to humans. We further evaluate the influence of evaluation strategies and prompting techniques. Project website: https://growing-ai-like-a-child.github.io/
Submitted 2 November, 2024; v1 submitted 6 October, 2024; originally announced October 2024.
Comments: Project website: https://growing-ai-like-a-child.github.io/
However, this result falls far short of industrial requirements. Our analysis reveals that current MLLMs still have significant room for improvement in answering questions related to industrial anomalies and defects. We further explore two training-free performance enhancement strategies to help models improve in industrial scenarios, highlighting their promising potential for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09453v2-abstract-full').style.display = 'none'; document.getElementById('2410.09453v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">The code and data are available at https://github.com/jam-cc/MMAD</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06407">arXiv:2410.06407</a> <span> [<a href="https://arxiv.org/pdf/2410.06407">pdf</a>, <a href="https://arxiv.org/format/2410.06407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Methodology">stat.ME</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> A Skewness-Based Criterion for Addressing Heteroscedastic Noise in Causal Discovery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yingyu Lin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yuxing Huang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+W">Wenqin Liu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Haoran Deng</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+I">Ignavier Ng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+K">Kun Zhang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+M">Mingming Gong</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yi-An Ma</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+B">Biwei Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06407v1-abstract-short" style="display: inline;"> Real-world data often violates the equal-variance assumption (homoscedasticity), making it essential to account for heteroscedastic noise in causal discovery. In this work, we explore heteroscedastic symmetric noise models (HSNMs), where the effect $Y$ is modeled as $Y = f(X) + 蟽(X)N$, with $X$ as the cause and $N$ as independent noise following a symmetric distribution. 
Abstract: Real-world data often violates the equal-variance assumption (homoscedasticity), making it essential to account for heteroscedastic noise in causal discovery. In this work, we explore heteroscedastic symmetric noise models (HSNMs), where the effect $Y$ is modeled as $Y = f(X) + \sigma(X)N$, with $X$ as the cause and $N$ as independent noise following a symmetric distribution. We introduce a novel criterion for identifying HSNMs based on the skewness of the score (i.e., the gradient of the log density) of the data distribution. This criterion yields a computationally tractable measure that is zero in the causal direction but nonzero in the anticausal direction, enabling causal direction discovery. We extend this skewness-based criterion to the multivariate setting and propose SkewScore, an algorithm that handles heteroscedastic noise without requiring the extraction of exogenous noise. We also conduct a case study on the robustness of SkewScore in a bivariate model with a latent confounder, providing theoretical insights into its performance. Empirical studies further validate the effectiveness of the proposed method.
Submitted 8 October, 2024; originally announced October 2024.
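The bivariate criterion lends itself to a compact numerical illustration. The sketch below is not the authors' SkewScore algorithm: it takes one plausible reading of the idea, approximating the score of the joint density by finite differences of a Gaussian-KDE log-density and comparing its skewness in the two candidate directions. The choice of f, sigma, the Laplace noise, and the score_skewness helper are all illustrative assumptions.

```python
import numpy as np
from scipy.stats import gaussian_kde, skew

def score_skewness(x, y, eps=1e-3):
    # Skewness of an estimate of d/dy log p(x, y), the score along the
    # candidate effect. KDE + finite differences is a crude stand-in for
    # the score estimators a real implementation would use.
    kde = gaussian_kde(np.vstack([x, y]))
    up = kde.logpdf(np.vstack([x, y + eps]))
    down = kde.logpdf(np.vstack([x, y - eps]))
    return skew((up - down) / (2 * eps))

# Toy HSNM: Y = f(X) + sigma(X) * N with symmetric (Laplace) noise.
rng = np.random.default_rng(0)
x = rng.normal(size=2000)
n = rng.laplace(size=2000)
y = np.tanh(x) + (0.5 + 0.5 * x**2) * n

# The criterion predicts near-zero skewness in the causal direction and
# nonzero skewness in the anticausal direction.
print("X -> Y skewness:", score_skewness(x, y))
print("Y -> X skewness:", score_skewness(y, x))
```

Under this toy setup the two printed values are the decision signal the abstract describes; a full implementation would replace the KDE step with a proper score estimator and the multivariate extension.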
arXiv:2410.03779 (https://arxiv.org/abs/2410.03779)
Subjects: Machine Learning (cs.LG); Artificial Intelligence (cs.AI); Computational Engineering, Finance, and Science (cs.CE)
Discovering Message Passing Hierarchies for Mesh-Based Physics Simulation
Authors: Huayu Deng, Xiangming Zhu, Yunbo Wang, Xiaokang Yang
Abstract: Graph neural networks have emerged as a powerful tool for large-scale mesh-based physics simulation. Existing approaches primarily employ hierarchical, multi-scale message passing to capture long-range dependencies within the graph. However, these graph hierarchies are typically fixed and manually designed, and therefore do not adapt to the evolving dynamics of complex physical systems. In this paper, we introduce a novel neural network named DHMP, which learns Dynamic Hierarchies for Message Passing networks through a differentiable node selection method. The key component is the anisotropic message passing mechanism, which operates on both intra-level and inter-level interactions. Unlike existing methods, it first supports directionally non-uniform aggregation of dynamic features between adjacent nodes within each graph hierarchy. Second, it determines node selection probabilities for the next hierarchy according to different physical contexts, thereby creating more flexible message shortcuts for learning remote node relations. Our experiments demonstrate the effectiveness of DHMP, achieving a 22.7% average improvement over recent fixed-hierarchy message passing networks across five classic physics simulation datasets.
Submitted 3 October, 2024; originally announced October 2024.
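As a rough picture of the two ingredients the abstract names, directionally non-uniform message aggregation and differentiable node selection, here is a minimal PyTorch sketch. It is not the published DHMP architecture; the gating, the Gumbel-style relaxation, and the layer sizes are assumptions made purely for illustration.

```python
import torch
import torch.nn as nn

def logistic_noise(x):
    # Difference of two Gumbels = logistic noise; used for a Binary-Concrete
    # (Gumbel-sigmoid) relaxation of the keep/drop decision.
    u = torch.rand_like(x).clamp(1e-6, 1 - 1e-6)
    return torch.log(u) - torch.log1p(-u)

class AnisotropicMessagePassing(nn.Module):
    # Minimal sketch: per-edge, direction-dependent message gating plus a
    # differentiable per-node probability of being kept for the next level.
    # NOT the published DHMP model; sizes and gating are illustrative.
    def __init__(self, dim):
        super().__init__()
        self.msg = nn.Sequential(nn.Linear(3 * dim, dim), nn.SiLU(), nn.Linear(dim, dim))
        self.gate = nn.Sequential(nn.Linear(3 * dim, dim), nn.SiLU(), nn.Linear(dim, dim))
        self.select = nn.Linear(dim, 1)

    def forward(self, h, edge_index, edge_attr, tau=1.0):
        src, dst = edge_index                                  # (E,), (E,)
        e = torch.cat([h[src], h[dst], edge_attr], dim=-1)     # edge context, order-sensitive
        m = self.msg(e) * torch.sigmoid(self.gate(e))          # direction-dependent weighting
        h = h + torch.zeros_like(h).index_add_(0, dst, m)      # aggregate messages per receiver
        logits = self.select(h).squeeze(-1)
        keep_soft = torch.sigmoid((logits + logistic_noise(logits)) / tau)
        keep = (keep_soft > 0.5).float() + keep_soft - keep_soft.detach()  # straight-through
        return h, keep                                         # keep ~ next-level node selection

# Toy usage: 6 nodes, 8 directed edges, 16-dim features.
h = torch.randn(6, 16)
edges = torch.randint(0, 6, (2, 8))
h_new, keep = AnisotropicMessagePassing(16)(h, edges, torch.randn(8, 16))
print(h_new.shape, keep)
```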
arXiv:2410.00332 (https://arxiv.org/abs/2410.00332)
Subjects: Artificial Intelligence (cs.AI); Neurons and Cognition (q-bio.NC)
Vision Language Models Know Law of Conservation without Understanding More-or-Less
Authors: Dezhi Luo, Haiyun Lyu, Qingying Gao, Haoran Sun, Yijiang Li, Hokin Deng
Abstract: Conservation is a critical milestone of cognitive development, considered to be supported by both the understanding of quantitative concepts and the reversibility of mental operations. To assess whether this critical component of human intelligence has emerged in Vision Language Models, we curated ConserveBench, a battery of 365 cognitive experiments across four dimensions of physical quantities: volume, solid quantity, length, and number. The former two involve only transformational tasks, whereas the latter two involve non-transformational tasks assessing the understanding of quantitative concepts alone. Surprisingly, we find that while Vision Language Models are generally capable of conserving, they tend to fail at non-transformational tasks whose successes are typically considered to be evidence of the ability to conserve. This implies that the law of conservation, at least in concrete domains, may exist without a corresponding conceptual understanding of quantity. Website: https://growing-ai-like-a-child.github.io/pages/Conservation/
Submitted 22 December, 2024; v1 submitted 30 September, 2024; originally announced October 2024.
Comments: Website: https://growing-ai-like-a-child.github.io/pages/Conservation/

arXiv:2410.00324 (https://arxiv.org/abs/2410.00324)
Subjects: Artificial Intelligence (cs.AI)
Vision Language Models See What You Want but not What You See
Authors: Qingying Gao, Yijiang Li, Haiyun Lyu, Haoran Sun, Dezhi Luo, Hokin Deng
Abstract: Knowing others' intentions and taking others' perspectives are two core components of human intelligence that are considered to be instantiations of theory-of-mind. Endowing machines with these abilities is an important step towards building human-level artificial intelligence. Here, to investigate intentionality understanding and level-2 perspective-taking in Vision Language Models (VLMs), we constructed IntentBench and PerspectBench, which together contain over 300 cognitive experiments grounded in real-world scenarios and classic cognitive tasks. We found that VLMs achieve high performance on intentionality understanding but low performance on level-2 perspective-taking. This suggests a potential dissociation between simulation-based and theory-based theory-of-mind abilities in VLMs, and raises the concern that they are not capable of using model-based reasoning to infer others' mental states. Website: https://growing-ai-like-a-child.github.io/pages/Three%20Mountain%20Task/
Submitted 12 February, 2025; v1 submitted 30 September, 2024; originally announced October 2024.
Comments: Website: https://growing-ai-like-a-child.github.io/pages/Three%20Mountain%20Task/

arXiv:2410.00318 (https://arxiv.org/abs/2410.00318)
Subjects: Artificial Intelligence (cs.AI); Neurons and Cognition (q-bio.NC)
Probing Mechanical Reasoning in Large Vision Language Models
Authors: Haoran Sun, Qingying Gao, Haiyun Lyu, Dezhi Luo, Yijiang Li, Hokin Deng
Abstract: Mechanical reasoning is a hallmark of human intelligence, defined by its ubiquitous yet irreplaceable role in human activities ranging from routine tasks to civil engineering. Embedding machines with mechanical reasoning is therefore an important step towards building human-level artificial intelligence. Here, we leveraged 155 cognitive experiments to test the understanding of system stability, gears and pulley systems, the principle of leverage, inertia and motion, and fluid mechanics in 26 Vision Language Models (VLMs). Results indicate that VLMs consistently perform worse than humans across all domains, and show particular difficulty in reasoning about gear systems and fluid mechanics. Notably, their performance on these tasks does not improve as the number of parameters increases, suggesting that current attention-based architectures may fail to grasp certain underlying mechanisms required for mechanical reasoning, particularly those pertaining to mental simulation.
Submitted 13 February, 2025; v1 submitted 30 September, 2024; originally announced October 2024.

arXiv:2409.20146 (https://arxiv.org/abs/2409.20146)
Subjects: Computer Vision and Pattern Recognition (cs.CV)
VMAD: Visual-enhanced Multimodal Large Language Model for Zero-Shot Anomaly Detection
Authors: Huilin Deng, Hongchen Luo, Wei Zhai, Yang Cao, Yu Kang
Abstract: Zero-shot anomaly detection (ZSAD) recognizes and localizes anomalies in previously unseen objects by establishing a feature mapping between textual prompts and inspection images, demonstrating excellent research value in flexible industrial manufacturing. However, existing ZSAD methods are limited by closed-world settings and struggle to handle unseen defects with predefined prompts. Recently, adapting Multimodal Large Language Models (MLLMs) for Industrial Anomaly Detection (IAD) has emerged as a viable solution. Unlike fixed-prompt methods, MLLMs exhibit a generative paradigm with open-ended text interpretation, enabling more adaptive anomaly analysis. However, this adaptation faces inherent challenges, as anomalies often manifest in fine-grained regions and exhibit minimal visual discrepancies from normal samples. To address these challenges, we propose VMAD (Visual-enhanced MLLM Anomaly Detection), a novel framework that enhances an MLLM with visual-based IAD knowledge and fine-grained perception, simultaneously providing precise detection and comprehensive analysis of anomalies. Specifically, we design a Defect-Sensitive Structure Learning scheme that transfers patch-similarity cues from the visual branch to the MLLM for improved anomaly discrimination. Besides, we introduce a novel visual projector, Locality-enhanced Token Compression, which mines multi-level features in local contexts to enhance fine-grained detection. Furthermore, we introduce the Real Industrial Anomaly Detection (RIAD) dataset, a comprehensive IAD dataset with detailed anomaly descriptions and analyses, offering a valuable resource for MLLM-based IAD development. Extensive experiments on zero-shot benchmarks, including the MVTec-AD, Visa, WFDD, and RIAD datasets, demonstrate our superior performance over state-of-the-art methods. The code and dataset will be available soon.
Submitted 30 September, 2024; originally announced September 2024.
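The "patch-similarity cues" mentioned for Defect-Sensitive Structure Learning can be pictured with a generic nearest-neighbour scoring step over patch features, sketched below. This is a stand-in closer to memory-bank methods such as PatchCore rather than the paper's module, and the feature shapes and grid size are assumptions.

```python
import torch
import torch.nn.functional as F

def patch_anomaly_map(patch_tokens, normal_bank, grid=(16, 16)):
    # Generic nearest-neighbour patch scoring, shown only to illustrate what a
    # "patch-similarity cue" can look like; it is not VMAD's DSSL module.
    q = F.normalize(patch_tokens, dim=-1)          # (N, D) query-image patch features
    r = F.normalize(normal_bank, dim=-1)           # (M, D) features from defect-free references
    sim = q @ r.T                                  # (N, M) cosine similarities
    score = 1.0 - sim.max(dim=1).values            # far from every normal patch => anomalous
    return score.reshape(grid)                     # coarse anomaly map over the patch grid

# Toy usage with random stand-ins for ViT patch embeddings (assumed shapes).
amap = patch_anomaly_map(torch.randn(256, 768), torch.randn(4096, 768))
print(amap.shape, float(amap.max()))
```

A map like this, or a similarity structure derived from it, is the kind of signal a visual branch could pass to the language model for localization-aware answers.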
arXiv:2409.18968 (https://arxiv.org/abs/2409.18968)
Subjects: Computers and Society (cs.CY); Artificial Intelligence (cs.AI); Machine Learning (cs.LG)
Safety challenges of AI in medicine in the era of large language models
Authors: Xiaoye Wang, Nicole Xi Zhang, Hongyu He, Trang Nguyen, Kun-Hsing Yu, Hao Deng, Cynthia Brandt, Danielle S. Bitterman, Ling Pan, Ching-Yu Cheng, James Zou, Dianbo Liu
Abstract: Recent advancements in artificial intelligence (AI), particularly in large language models (LLMs), have unlocked significant potential to enhance the quality and efficiency of medical care. By introducing a novel way to interact with AI and data through natural language, LLMs offer new opportunities for medical practitioners, patients, and researchers. However, as AI and LLMs become more powerful and, in some medical tasks, achieve superhuman performance, public concerns over their safety have intensified. These concerns have emerged as the most significant obstacle to the adoption of AI in medicine. In response, this review examines emerging risks in AI utilization during the LLM era. First, we explore LLM-specific safety challenges from functional and communication perspectives, addressing issues across data collection, model training, and real-world application. We then consider inherent safety problems shared by all AI systems, along with additional complications introduced by LLMs. Last, we discuss how safety issues in using AI for clinical practice and healthcare system operations can undermine trust among patients, clinicians, and the public, and how to build confidence in these systems. By emphasizing the development of safe AI, we believe these technologies can be more rapidly and reliably integrated into everyday medical practice to benefit both patients and clinicians.
Submitted 30 January, 2025; v1 submitted 11 September, 2024; originally announced September 2024.

arXiv:2409.09254 (https://arxiv.org/abs/2409.09254)
Subjects: Computer Vision and Pattern Recognition (cs.CV)
VSFormer: Mining Correlations in Flexible View Set for Multi-view 3D Shape Understanding
Authors: Hongyu Sun, Yongcai Wang, Peng Wang, Haoran Deng, Xudong Cai, Deying Li
Abstract: View-based methods have demonstrated promising performance in 3D shape understanding. However, they tend to make strong assumptions about the relations between views or learn multi-view correlations indirectly, which limits the flexibility of exploring inter-view correlations and the effectiveness on target tasks. To overcome these problems, this paper investigates flexible organization and explicit correlation learning for multiple views. In particular, we propose to incorporate different views of a 3D shape into a permutation-invariant set, referred to as the View Set, which removes rigid relation assumptions and facilitates adequate information exchange and fusion among views. Based on that, we devise a nimble Transformer model, named VSFormer, to explicitly capture pairwise and higher-order correlations of all elements in the set. Meanwhile, we theoretically reveal a natural correspondence between the Cartesian product of a view set and the correlation matrix in the attention mechanism, which supports our model design. Comprehensive experiments suggest that VSFormer offers better flexibility, higher inference efficiency, and superior performance. Notably, VSFormer reaches state-of-the-art results on various 3D recognition datasets, including ModelNet40, ScanObjectNN, and RGBD. It also establishes new records on the SHREC'17 retrieval benchmark. The code and datasets are available at https://github.com/auniquesun/VSFormer.
Submitted 13 September, 2024; originally announced September 2024.
Comments: Accepted by TVCG 2024
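The core "view set" idea, self-attention over an unordered set of per-view features with no positional encoding, followed by order-independent pooling, can be sketched in a few lines of PyTorch. This is a minimal stand-in rather than the published VSFormer; the backbone features, layer sizes, pooling choice, and class count are assumptions.

```python
import torch
import torch.nn as nn

class ViewSetEncoder(nn.Module):
    # Treat the views of a shape as an unordered set and let self-attention
    # model pairwise view correlations; omitting positional encodings keeps
    # the encoder permutation-equivariant, and max pooling makes the final
    # representation permutation-invariant.
    def __init__(self, dim=512, heads=8, layers=2, num_classes=40):
        super().__init__()
        block = nn.TransformerEncoderLayer(d_model=dim, nhead=heads, batch_first=True)
        self.encoder = nn.TransformerEncoder(block, num_layers=layers)
        self.head = nn.Linear(dim, num_classes)

    def forward(self, view_feats):            # (B, V, dim) per-view features from any 2D backbone
        fused = self.encoder(view_feats)      # attention spans all view pairs (the V x V products)
        pooled = fused.max(dim=1).values      # order-independent pooling over the set
        return self.head(pooled)

# Toy usage: batch of 4 shapes, 12 views each, 512-dim view features.
logits = ViewSetEncoder()(torch.randn(4, 12, 512))
print(logits.shape)   # torch.Size([4, 40])
```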
arXiv:2409.01887 (https://arxiv.org/abs/2409.01887)
Subjects: Cryptography and Security (cs.CR)
Detecting and Measuring Security Implications of Entangled Domain Verification in CDN
Authors: Ziyu Lin, Zhiwei Lin, Run Guo, Jianjun Chen, Mingming Zhang, Ximeng Liu, Tianhao Yang, Zhuoran Cao, Robert H. Deng
Abstract: Content Delivery Networks (CDNs) offer a protection layer for enhancing the security of websites. However, a significant security flaw, the Absence of Domain Verification (DVA), has emerged recently. Although this threat is recognized, the current practices and security flaws of domain verification strategies in CDNs have not been thoroughly investigated. In this paper, we present DVAHunter, an automated system for detecting DVA vulnerabilities that can lead to domain abuse in CDNs. Our evaluation of 45 major CDN providers reveals the prevalence of DVA: most (39/45) providers do not perform any verification, and even those that do remain exploitable. Additionally, we used DVAHunter to conduct a large-scale measurement of 89M subdomains from Tranco's Top 1M sites hosted on the 45 CDNs under evaluation. Our focus was on two primary DVA exploitation scenarios: covert communication and domain hijacking. We identified over 332K subdomains vulnerable to domain abuse. The tool provides deeper insights into DVA exploitation and allows us to propose viable mitigation practices for CDN providers. To date, we have received vulnerability confirmations from 12 providers; 6 (e.g., Edgio, Kuocai) have implemented fixes, and 1 (ChinaNetCenter) is actively working on solutions based on our recommendations.
Submitted 3 September, 2024; originally announced September 2024.
Comments: 18 pages
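A first step in this kind of measurement is identifying which subdomains are fronted by a CDN at all, typically via their CNAME chains. The sketch below shows only that step, using the dnspython library; it is not DVAHunter, and the suffix list and example hostnames are placeholders.

```python
import dns.exception
import dns.resolver

# Illustrative, not authoritative: a few CNAME suffixes that commonly indicate
# a CDN front-end. A real study would use far richer provider fingerprints.
CDN_CNAME_SUFFIXES = {
    ".cloudfront.net.": "CloudFront",
    ".akamaiedge.net.": "Akamai",
    ".fastly.net.": "Fastly",
}

def cdn_via_cname(subdomain):
    # Resolve the first CNAME hop and match it against known CDN suffixes.
    try:
        answers = dns.resolver.resolve(subdomain, "CNAME")
    except dns.exception.DNSException:
        return None
    for rdata in answers:
        target = rdata.target.to_text().lower()
        for suffix, provider in CDN_CNAME_SUFFIXES.items():
            if target.endswith(suffix):
                return provider
    return None

for name in ["www.example.com", "assets.example.org"]:   # placeholder inputs
    print(name, "->", cdn_via_cname(name))
```

Once CDN-fronted subdomains are known, the study's actual contribution, probing whether the provider verifies domain ownership before serving them, takes over.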
arXiv:2408.17182 (https://arxiv.org/abs/2408.17182)
Subjects: Computer Vision and Pattern Recognition (cs.CV)
Hybrid Classification-Regression Adaptive Loss for Dense Object Detection
Authors: Yanquan Huang, Liu Wei Zhen, Yun Hao, Mengyuan Zhang, Qingyao Wu, Zikun Deng, Xueming Liu, Hong Deng
Abstract: For object detectors, enhancing model performance hinges on the ability to simultaneously consider inconsistencies across tasks and focus on difficult-to-train samples. Achieving this necessitates incorporating information from both the classification and regression tasks. However, prior work tends to either emphasize difficult-to-train samples within their respective tasks or simply compute classification scores with IoU, often leading to suboptimal model performance. In this paper, we propose a Hybrid Classification-Regression Adaptive Loss, termed HCRAL. Specifically, we introduce the Residual of Classification and IoU (RCI) module for cross-task supervision, addressing task inconsistencies, and the Conditioning Factor (CF) to focus on difficult-to-train samples within each task. Furthermore, we introduce a new strategy named Expanded Adaptive Training Sample Selection (EATSS) to provide additional samples that exhibit classification and regression inconsistencies. To validate the effectiveness of the proposed method, we conduct extensive experiments on COCO test-dev. Experimental evaluations demonstrate the superiority of our approach. Additionally, we design experiments that separately combine the classification and regression losses with regular loss functions in popular one-stage models, demonstrating improved performance.
Submitted 30 August, 2024; originally announced August 2024.
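The abstract's central idea, letting the mismatch between classification confidence and localization quality re-weight both branches while a focal-style factor emphasizes hard samples, can be sketched with a generic loss. The exact RCI and CF formulations are not given in this listing, so the weighting below is an illustrative stand-in, and treating the IoU scalar as the regression target is my simplification for self-containment.

```python
import torch
import torch.nn.functional as F

def hybrid_cls_reg_loss(cls_logits, cls_targets, pred_iou, target_iou, gamma=2.0):
    # Illustrative cross-task weighting only: |p - IoU| measures the
    # classification/localization inconsistency and scales both branches,
    # while a focal-style factor emphasizes difficult samples. This is not
    # HCRAL's actual RCI/CF definition.
    p = torch.sigmoid(cls_logits)
    pt = p * cls_targets + (1 - p) * (1 - cls_targets)    # prob. assigned to the true class
    focal = (1 - pt) ** gamma
    residual = (p - pred_iou).abs().detach()              # cross-task inconsistency weight

    cls_loss = F.binary_cross_entropy_with_logits(cls_logits, cls_targets, reduction="none")
    cls_loss = ((1 + residual) * focal * cls_loss).mean()

    reg_loss = F.smooth_l1_loss(pred_iou, target_iou, reduction="none")
    reg_loss = ((1 + residual) * reg_loss).mean()
    return cls_loss + reg_loss

# Toy usage on 8 anchors.
loss = hybrid_cls_reg_loss(torch.randn(8), torch.randint(0, 2, (8,)).float(),
                           torch.rand(8), torch.ones(8))
print(float(loss))
```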
arXiv:2408.13184 (https://arxiv.org/abs/2408.13184)
Subjects: Computation and Language (cs.CL)
Can LLM be a Good Path Planner based on Prompt Engineering? Mitigating the Hallucination for Path Planning
Authors: Hourui Deng, Hongjie Zhang, Jie Ou, Chaosheng Feng
Abstract: Spatial reasoning in Large Language Models (LLMs) is the foundation for embodied intelligence. However, even in simple maze environments, LLMs still encounter challenges in long-term path planning, primarily due to spatial hallucination and context-inconsistency hallucination during long-term reasoning. To address this challenge, this study proposes an innovative model, Spatial-to-Relational Transformation and Curriculum Q-Learning (S2RCQL). To address the spatial hallucination of LLMs, we propose the Spatial-to-Relational approach, which transforms spatial prompts into entity relations and paths representing entity-relation chains. This approach fully taps the potential of LLMs for sequential thinking. We then design a path-planning algorithm based on Q-learning to mitigate the context-inconsistency hallucination, which enhances the reasoning ability of LLMs. Using the Q-values of state-action pairs as auxiliary information in prompts, we correct the hallucinations of LLMs, thereby guiding them towards the optimal path. Finally, we propose a reverse curriculum learning technique based on LLMs to further mitigate the context-inconsistency hallucination: by reducing task difficulty, LLMs can rapidly accumulate successful experiences and leverage them to tackle more complex tasks. We performed comprehensive experiments based on Baidu's self-developed LLM, ERNIE-Bot 4.0. The results show that S2RCQL achieves a 23%-40% improvement in both success and optimality rates compared with advanced prompt-engineering baselines.
Submitted 26 August, 2024; v1 submitted 23 August, 2024; originally announced August 2024.
Comments: Submitted to ICASSP
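The "Q-values as auxiliary prompt information" step can be pictured with a tiny tabular Q-learning loop over a relational graph whose learned values are rendered into a textual hint for the planner's prompt. The graph, rewards, and wording below are invented for illustration; no claim is made that this matches the paper's exact algorithm or curriculum.

```python
import random
from collections import defaultdict

# Hypothetical relational graph (entity-relation chain view of a small maze).
GRAPH = {"A": ["B", "C"], "B": ["A", "D"], "C": ["A", "D"], "D": ["B", "C", "GOAL"], "GOAL": []}

def train_q(episodes=500, alpha=0.5, gamma=0.9, eps=0.2):
    # Standard epsilon-greedy tabular Q-learning with a small step cost.
    Q = defaultdict(float)
    for _ in range(episodes):
        s = "A"
        while s != "GOAL":
            nbrs = GRAPH[s]
            a = random.choice(nbrs) if random.random() < eps else max(nbrs, key=lambda n: Q[(s, n)])
            r = 1.0 if a == "GOAL" else -0.05
            best_next = max((Q[(a, n)] for n in GRAPH[a]), default=0.0)
            Q[(s, a)] += alpha * (r + gamma * best_next - Q[(s, a)])
            s = a
    return Q

def q_hint(Q, state):
    # Turn the learned values into auxiliary text to append to the LLM prompt.
    ranked = sorted(GRAPH[state], key=lambda n: Q[(state, n)], reverse=True)
    scores = ", ".join(f"{n}: {Q[(state, n)]:.2f}" for n in ranked)
    return f"You are at {state}. Estimated values of next moves: {scores}. Prefer the highest."

Q = train_q()
print(q_hint(Q, "A"))   # this string would be appended to the LLM planning prompt
```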
arXiv:2408.11609 (https://arxiv.org/abs/2408.11609)
Subjects: Computation and Language (cs.CL); Artificial Intelligence (cs.AI)
Xinyu: An Efficient LLM-based System for Commentary Generation
Authors: Yiquan Wu, Bo Tang, Chenyang Xi, Yu Yu, Pengyu Wang, Yifei Liu, Kun Kuang, Haiying Deng, Zhiyu Li, Feiyu Xiong, Jie Hu, Peng Cheng, Zhonghao Wang, Yi Wang, Yi Luo, Mingchuan Yang
Abstract: Commentary provides readers with a deep understanding of events by presenting diverse arguments and evidence. However, creating commentary is a time-consuming task, even for skilled commentators. Large language models (LLMs) have simplified the process of natural language generation, but their direct application to commentary creation still faces challenges due to unique task requirements. These requirements fall into two levels: 1) fundamental requirements, which include creating well-structured and logically consistent narratives, and 2) advanced requirements, which involve generating quality arguments and providing convincing evidence. In this paper, we introduce Xinyu, an efficient LLM-based system designed to assist commentators in generating Chinese commentaries. To meet the fundamental requirements, we deconstruct the generation process into sequential steps, proposing targeted strategies and supervised fine-tuning (SFT) for each step. To address the advanced requirements, we present an argument ranking model and establish a comprehensive evidence database that includes up-to-date events and classic books, thereby strengthening the substantiation of the evidence with retrieval-augmented generation (RAG) technology. To evaluate the generated commentaries more fairly, corresponding to the two-level requirements, we introduce a comprehensive evaluation metric that considers five distinct perspectives in commentary generation. Our experiments confirm the effectiveness of the proposed system. We also observe a significant increase in the efficiency of commentators in real-world scenarios, with the average time spent on creating a commentary dropping from 4 hours to 20 minutes. Importantly, this increase in efficiency does not compromise the quality of the commentaries.
Submitted 22 August, 2024; v1 submitted 21 August, 2024; originally announced August 2024.
ACM Class: I.2.7

arXiv:2408.07468 (https://arxiv.org/abs/2408.07468)
Subjects: Human-Computer Interaction (cs.HC)
Exploring the Impact of Passthrough on VR Exergaming in Public Environments: A Field Study
Authors: Zixuan Guo, Hanxiao Deng, Hongyu Wang, Angel J. Y. Tan, Wenge Xu, Hai-Ning Liang
Abstract: Sedentary behavior is becoming increasingly prevalent in daily work and study environments, and VR exergaming has emerged as a promising solution in these places of work and study. However, private spaces in such environments are hard to come by, and engaging in VR exergaming in public settings presents its own set of challenges (e.g., safety, social acceptance, isolation, and privacy protection). The recent development of Passthrough functionality in VR headsets allows users to maintain awareness of their surroundings, enhancing safety and convenience. Despite its potential benefits, little is known about how Passthrough affects user performance and experience, or whether it can address the challenges of playing VR exergames in real-world public environments. To our knowledge, this work is the first to conduct a field study, in an underground passageway on a university campus, exploring the use of Passthrough in a real-world public environment, with a disturbance-free closed room as a baseline. Results indicate that enabling Passthrough in a public environment improves performance without compromising presence. Moreover, Passthrough can increase social acceptance, especially among individuals with higher levels of self-consciousness. These findings highlight Passthrough's potential to encourage VR exergaming adoption in public environments, with promising implications for overall health and well-being.
Submitted 14 August, 2024; originally announced August 2024.

arXiv:2408.01057 (https://arxiv.org/abs/2408.01057)
Subjects: Human-Computer Interaction (cs.HC)
DOI: 10.1145/3711076 (https://doi.org/10.1145/3711076)
Supporting Industry Computing Researchers in Assessing, Articulating, and Addressing the Potential Negative Societal Impact of Their Work
Authors: Wesley Hanwen Deng, Solon Barocas, Jennifer Wortman Vaughan
In addition, while there are nascent efforts to support researchers in this task, there remains a dearth of empirically-informed tools and processes. Through interviews with 25 industry computing researchers across different companies and research areas, we first identify four key factors that influence how they grapple with (or choose not to grapple with) the societal impact of their research. To develop an effective impact assessment template tailored to industry computing researchers' needs, we conduct an iterative co-design process with these 25 industry researchers and an additional 16 researchers and practitioners with prior experience and expertise in reviewing and developing impact assessments or broad responsible computing practices. Through the co-design process, we develop 10 design considerations to facilitate the effective design, development, and adaptation of an impact assessment template for use in industry research settings and beyond, as well as our own ``Societal Impact Assessment'' template with concrete scaffolds. We explore the effectiveness of this template through a user study with 15 industry research interns, revealing both its strengths and limitations. Finally, we discuss the implications for future researchers and organizations seeking to foster more responsible research practices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.01057v3-abstract-full').style.display = 'none'; document.getElementById('2408.01057v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 2 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Proc. ACM Hum.-Comput. Interact. 
9, 2, Article CSCW 2025 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.00855">arXiv:2408.00855</a> <span> [<a href="https://arxiv.org/pdf/2408.00855">pdf</a>, <a href="https://arxiv.org/format/2408.00855">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3678518">10.1145/3678518 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> HAIGEN: Towards Human-AI Collaboration for Facilitating Creativity and Style Generation in Fashion Design </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jianan Jiang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+D">Di Wu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Hanhui Deng</a>, <a href="/search/cs?searchtype=author&query=Long%2C+Y">Yidan Long</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+W">Wenyi Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Can Liu</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Z">Zhanpeng Jin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wenlei Zhang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+T">Tangquan Qi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.00855v3-abstract-short" style="display: inline;"> The process of fashion design usually involves sketching, refining, and coloring, with designers drawing inspiration from various images to fuel their creative endeavors. However, conventional image search methods often yield irrelevant results, impeding the design process. Moreover, creating and coloring sketches can be time-consuming and demanding, acting as a bottleneck in the design workflow.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00855v3-abstract-full').style.display = 'inline'; document.getElementById('2408.00855v3-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.00855v3-abstract-full" style="display: none;"> The process of fashion design usually involves sketching, refining, and coloring, with designers drawing inspiration from various images to fuel their creative endeavors. However, conventional image search methods often yield irrelevant results, impeding the design process. Moreover, creating and coloring sketches can be time-consuming and demanding, acting as a bottleneck in the design workflow. In this work, we introduce HAIGEN (Human-AI Collaboration for GENeration), an efficient fashion design system for Human-AI collaboration developed to aid designers. Specifically, HAIGEN consists of four modules. T2IM, located in the cloud, generates reference inspiration images directly from text prompts. With three other modules situated locally, the I2SM batch generates the image material library into a certain designer-style sketch material library. 
The SRM recommends similar sketches in the generated library to designers for further refinement, and the STM colors the refined sketch according to the styles of inspiration images. Through our system, any designer can perform local personalized fine-tuning and leverage the powerful generation capabilities of large models in the cloud, streamlining the entire design development process. Given that our approach integrates both cloud and local model deployment schemes, it effectively safeguards design privacy by avoiding the need to upload personalized data from local designers. We validated the effectiveness of each module through extensive qualitative and quantitative experiments. User surveys also confirmed that HAIGEN offers significant advantages in design efficiency, positioning it as a new generation of aid-tool for designers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.00855v3-abstract-full').style.display = 'none'; document.getElementById('2408.00855v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies (ACM IMWUT/UbiComp 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.17793">arXiv:2407.17793</a> <span> [<a href="https://arxiv.org/pdf/2407.17793">pdf</a>, <a href="https://arxiv.org/format/2407.17793">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Neurons and Cognition">q-bio.NC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> </div> </div> <p class="title is-5 mathjax"> Use-dependent Biases as Optimal Action under Information Bottleneck </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Deng%2C+H">Hokin Deng</a>, <a href="/search/cs?searchtype=author&query=Haith%2C+A">Adrian Haith</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.17793v4-abstract-short" style="display: inline;"> Use-dependent bias is a phenomenon in human sensorimotor behavior whereby movements become biased towards previously repeated actions. Despite being well-documented, the reason why this phenomenon occurs is not yet clearly understood. 
Here, we propose that use-dependent biases can be understood as a rational strategy for movement under limitations on the capacity to process sensory information to… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17793v4-abstract-full').style.display = 'inline'; document.getElementById('2407.17793v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.17793v4-abstract-full" style="display: none;"> Use-dependent bias is a phenomenon in human sensorimotor behavior whereby movements become biased towards previously repeated actions. Despite being well-documented, the reason why this phenomenon occurs is not yet clearly understood. Here, we propose that use-dependent biases can be understood as a rational strategy for movement under limitations on the capacity to process sensory information to guide motor output. We adopt an information-theoretic approach to characterize sensorimotor information processing and determine how behavior should be optimized given limitations to this capacity. We show that this theory naturally predicts the existence of use-dependent biases. Our framework also generates two further predictions. The first prediction relates to handedness. The dominant hand is associated with enhanced dexterity and reduced movement variability compared to the non-dominant hand, which we propose relates to a greater capacity for information processing in regions that control movement of the dominant hand. Consequently, the dominant hand should exhibit smaller use-dependent biases compared to the non-dominant hand. The second prediction relates to how use-dependent biases are affected by movement speed. When moving faster, it is more challenging to correct for initial movement errors online during the movement. This should exacerbate costs associated with initial directional error and, according to our theory, reduce the extent of use-dependent biases compared to slower movements, and vice versa. We show that these two empirical predictions, the handedness effect and the speed-dependent effect, are confirmed by experimental data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17793v4-abstract-full').style.display = 'none'; document.getElementById('2407.17793v4-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
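<p class="is-size-7">For readers who want a concrete handle on the capacity limitation invoked above, a generic information-bottleneck-style formulation of capacity-limited action selection chooses the policy $\pi(a\mid s)$ that minimizes expected movement cost subject to a bound on the sensorimotor mutual information; the cost function $c(s,a)$, capacity $C$, and multiplier $\beta$ below are generic placeholders, so treat this as a sketch of the general idea rather than the authors' exact objective.</p> <p class="mathjax">$$ \min_{\pi(a\mid s)} \; \mathbb{E}_{s \sim p(s),\, a \sim \pi(a\mid s)}\big[c(s,a)\big] \quad \text{subject to} \quad I(S;A) \le C, \qquad \text{equivalently} \qquad \min_{\pi(a\mid s)} \; \mathbb{E}\big[c(s,a)\big] + \beta\, I(S;A), \quad \beta > 0. $$</p> <p class="is-size-7">Because the $I(S;A)$ term pulls $\pi(a\mid s)$ toward the marginal action distribution $p(a)$, actions that have been produced frequently in the recent past act as attractors, which is one intuitive route from a capacity limit to the use-dependent biases discussed in the abstract.</p>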
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.04359">arXiv:2407.04359</a> <span> [<a href="https://arxiv.org/pdf/2407.04359">pdf</a>, <a href="https://arxiv.org/format/2407.04359">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Dance of the ADS: Orchestrating Failures through Historically-Informed Scenario Fuzzing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+T">Tong Wang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+T">Taotao Gu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Huan Deng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hu Li</a>, <a href="/search/cs?searchtype=author&query=Kuang%2C+X">Xiaohui Kuang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+G">Gang Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.04359v1-abstract-short" style="display: inline;"> As autonomous driving systems (ADS) advance towards higher levels of autonomy, orchestrating their safety verification becomes increasingly intricate. This paper unveils ScenarioFuzz, a pioneering scenario-based fuzz testing methodology. Designed like a choreographer who understands the past performances, it uncovers vulnerabilities in ADS without the crutch of predefined scenarios. Leveraging map… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04359v1-abstract-full').style.display = 'inline'; document.getElementById('2407.04359v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.04359v1-abstract-full" style="display: none;"> As autonomous driving systems (ADS) advance towards higher levels of autonomy, orchestrating their safety verification becomes increasingly intricate. This paper unveils ScenarioFuzz, a pioneering scenario-based fuzz testing methodology. Designed like a choreographer who understands the past performances, it uncovers vulnerabilities in ADS without the crutch of predefined scenarios. Leveraging map road networks, such as OPENDRIVE, we extract essential data to form a foundational scenario seed corpus. This corpus, enriched with pertinent information, provides the necessary boundaries for fuzz testing in the absence of starting scenarios. Our approach integrates specialized mutators and mutation techniques, combined with a graph neural network model, to predict and filter out high-risk scenario seeds, optimizing the fuzzing process using historical test data. Compared to other methods, our approach reduces the time cost by an average of 60.3%, while the number of error scenarios discovered per unit of time increases by 103%. Furthermore, we propose a self-supervised collision trajectory clustering method, which aids in identifying and summarizing 54 high-risk scenario categories prone to inducing ADS faults. 
Our experiments have successfully uncovered 58 bugs across six tested systems, emphasizing the critical safety concerns of ADS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.04359v1-abstract-full').style.display = 'none'; document.getElementById('2407.04359v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper was accepted by 33rd ACM SIGSOFT International Symposium on Software Testing and Analysis (ISSTA 2024)</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68Txx (Primary) <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> D.2.4; I.2.9; I.6.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.01601">arXiv:2407.01601</a> <span> [<a href="https://arxiv.org/pdf/2407.01601">pdf</a>, <a href="https://arxiv.org/format/2407.01601">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Unveiling and Controlling Anomalous Attention Distribution in Transformers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yan%2C+R">Ruiqing Yan</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xingbo Du</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Haoyu Deng</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+L">Linghan Zheng</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Q">Qiuzhuang Sun</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jifang Hu</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+Y">Yuhang Shao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+P">Penghao Jiang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jinrong Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lian Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.01601v2-abstract-short" style="display: inline;"> With the advent of large models based on the Transformer architecture, researchers have observed an anomalous phenomenon in the Attention mechanism--there is a very high attention on the first element, which is prevalent across Transformer-based models. 
It is crucial to understand it for the development of techniques focusing on attention distribution, such as Key-Value (KV) Cache compression and… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01601v2-abstract-full').style.display = 'inline'; document.getElementById('2407.01601v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.01601v2-abstract-full" style="display: none;"> With the advent of large models based on the Transformer architecture, researchers have observed an anomalous phenomenon in the Attention mechanism--there is a very high attention on the first element, which is prevalent across Transformer-based models. It is crucial to understand it for the development of techniques focusing on attention distribution, such as Key-Value (KV) Cache compression and infinite extrapolation; however, the latent cause remains unknown. In this paper, we analyze such a phenomenon from the perspective of the waiver phenomenon, which involves reducing the internal values of certain elements in the sequence, allowing them to absorb excess attention without affecting their contribution to information. In specific models, due to differences in positional encoding and attention patterns, we have found that the selection of waiver elements by the model can be categorized into two methods: positional-encoding-based and feature-distribution-within-elements-based. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01601v2-abstract-full').style.display = 'none'; document.getElementById('2407.01601v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 26 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
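<p class="is-size-7">As a concrete illustration of what "very high attention on the first element" means operationally, the short NumPy sketch below computes causal softmax attention weights for one head and reports how much of each query's attention mass lands on position 0. The dimensions, random inputs, and the causal-mask choice are illustrative assumptions rather than the authors' setup; with random projections the mass on position 0 stays unremarkable, so the anomaly described in the abstract is something one would look for in the weights of a trained Transformer.</p> <pre><code>import numpy as np

def causal_attention_weights(Q, K):
    """Single-head scaled dot-product attention weights with a causal mask."""
    d = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d)                        # (T, T) similarity scores
    future = np.triu(np.ones_like(scores, dtype=bool), k=1)
    scores = np.where(future, -np.inf, scores)           # block attention to future positions
    scores -= scores.max(axis=-1, keepdims=True)         # numerical stability
    weights = np.exp(scores)
    return weights / weights.sum(axis=-1, keepdims=True)

rng = np.random.default_rng(0)
T, d = 16, 64                                            # sequence length, head dimension
Q = rng.normal(size=(T, d))
K = rng.normal(size=(T, d))
A = causal_attention_weights(Q, K)

# Share of each query's attention mass assigned to the first position.
print("mean attention on position 0:", round(float(A[:, 0].mean()), 3))
</code></pre>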
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.01050">arXiv:2407.01050</a> <span> [<a href="https://arxiv.org/pdf/2407.01050">pdf</a>, <a href="https://arxiv.org/format/2407.01050">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Evolutionary Morphology Towards Overconstrained Locomotion via Large-Scale, Multi-Terrain Deep Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yenan Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chuye Zhang</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+P">Pengxi Gu</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+J">Jianuo Qiu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+J">Jiayi Yin</a>, <a href="/search/cs?searchtype=author&query=Qiu%2C+N">Nuofan Qiu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+G">Guojing Huang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+B">Bangchao Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zishang Zhang</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Hui Deng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Wei Zhang</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+F">Fang Wan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+C">Chaoyang Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.01050v1-abstract-short" style="display: inline;"> While the animals' Fin-to-Limb evolution has been well-researched in biology, such morphological transformation remains under-adopted in the modern design of advanced robotic limbs. This paper investigates a novel class of overconstrained locomotion from a design and learning perspective inspired by evolutionary morphology, aiming to integrate the concept of `intelligent design under constraints'… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01050v1-abstract-full').style.display = 'inline'; document.getElementById('2407.01050v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.01050v1-abstract-full" style="display: none;"> While the animals' Fin-to-Limb evolution has been well-researched in biology, such morphological transformation remains under-adopted in the modern design of advanced robotic limbs. This paper investigates a novel class of overconstrained locomotion from a design and learning perspective inspired by evolutionary morphology, aiming to integrate the concept of `intelligent design under constraints' - hereafter referred to as constraint-driven design intelligence - in developing modern robotic limbs with superior energy efficiency. We propose a 3D-printable design of robotic limbs parametrically reconfigurable as a classical planar 4-bar linkage, an overconstrained Bennett linkage, and a spherical 4-bar linkage. These limbs adopt a co-axial actuation, identical to the modern legged robot platforms, with the added capability of upgrading into a wheel-legged system. 
Then, we implemented a large-scale, multi-terrain deep reinforcement learning framework to train these reconfigurable limbs for a comparative analysis of overconstrained locomotion in energy efficiency. Results show that the overconstrained limbs exhibit more efficient locomotion than planar limbs during forward and sideways walking over different terrains, including floors, slopes, and stairs, with or without random noises, by saving at least 22% mechanical energy in completing the traverse task, with the spherical limbs being the least efficient. It also achieves the highest average speed of 0.85 meters per second on flat terrain, which is 20% faster than the planar limbs. This study paves the path for an exciting direction for future research in overconstrained robotics leveraging evolutionary morphology and reconfigurable mechanism intelligence when combined with state-of-the-art methods in deep reinforcement learning. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.01050v1-abstract-full').style.display = 'none'; document.getElementById('2407.01050v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 5 figures, Accepted and Presented at ReMAR2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.18548">arXiv:2406.18548</a> <span> [<a href="https://arxiv.org/pdf/2406.18548">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Exploration of Multi-Scale Image Fusion Systems in Intelligent Medical Image Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuxiang Hu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+H">Haowei Yang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Ting Xu</a>, <a href="/search/cs?searchtype=author&query=He%2C+S">Shuyao He</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiajie Yuan</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Haozhang Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.18548v1-abstract-short" style="display: inline;"> The diagnosis of brain cancer relies heavily on medical imaging techniques, with MRI being the most commonly used. It is necessary to perform automatic segmentation of brain tumors on MRI images. This project intends to build an MRI algorithm based on U-Net. 
The residual network and the module used to enhance the context information are combined, and the void space convolution pooling pyramid is a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18548v1-abstract-full').style.display = 'inline'; document.getElementById('2406.18548v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.18548v1-abstract-full" style="display: none;"> The diagnosis of brain cancer relies heavily on medical imaging techniques, with MRI being the most commonly used. It is necessary to perform automatic segmentation of brain tumors on MRI images. This project intends to build an MRI algorithm based on U-Net. The residual network and the module used to enhance the context information are combined, and the void space convolution pooling pyramid is added to the network for processing. The brain glioma MRI image dataset provided by cancer imaging archives was experimentally verified. A multi-scale segmentation method based on a weighted least squares filter was used to complete the 3D reconstruction of brain tumors. Thus, the accuracy of three-dimensional reconstruction is further improved. Experiments show that the local texture features obtained by the proposed algorithm are similar to those obtained by laser scanning. The algorithm is improved by using the U-Net method and an accuracy of 0.9851 is obtained. This approach significantly enhances the precision of image segmentation and boosts the efficiency of image classification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.18548v1-abstract-full').style.display = 'none'; document.getElementById('2406.18548v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. 
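<p class="is-size-7">The "void space convolution pooling pyramid" mentioned in the abstract reads like a rendering of dilated (atrous) spatial pyramid pooling; assuming that interpretation, the PyTorch sketch below shows the two generic ingredients named above, a residual block and a small pyramid of parallel dilated convolutions, purely as an illustration of the building blocks rather than a reconstruction of the authors' network.</p> <pre><code>import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    """Two 3x3 convolutions with batch norm, plus an identity skip connection."""
    def __init__(self, channels):
        super().__init__()
        self.body = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1), nn.BatchNorm2d(channels), nn.ReLU(inplace=True),
            nn.Conv2d(channels, channels, 3, padding=1), nn.BatchNorm2d(channels),
        )

    def forward(self, x):
        return torch.relu(x + self.body(x))

class DilatedPyramid(nn.Module):
    """Parallel 3x3 convolutions with increasing dilation rates, concatenated and fused by a 1x1 conv."""
    def __init__(self, channels, rates=(1, 2, 4)):
        super().__init__()
        self.branches = nn.ModuleList(
            [nn.Conv2d(channels, channels, 3, padding=r, dilation=r) for r in rates]
        )
        self.fuse = nn.Conv2d(channels * len(rates), channels, 1)

    def forward(self, x):
        return self.fuse(torch.cat([branch(x) for branch in self.branches], dim=1))

x = torch.randn(1, 32, 64, 64)                 # dummy feature map: batch 1, 32 channels, 64x64
y = DilatedPyramid(32)(ResidualBlock(32)(x))
print(y.shape)                                 # torch.Size([1, 32, 64, 64])
</code></pre>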
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.17538">arXiv:2406.17538</a> <span> [<a href="https://arxiv.org/pdf/2406.17538">pdf</a>, <a href="https://arxiv.org/format/2406.17538">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Three-Stream Temporal-Shift Attention Network Based on Self-Knowledge Distillation for Micro-Expression Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+G">Guanghao Zhu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lin Liu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuhao Hu</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Haixin Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F">Fang Liu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xiaohui Du</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+R">Ruqian Hao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Juanxiu Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yong Liu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Hao Deng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jing Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.17538v2-abstract-short" style="display: inline;"> Micro-expressions are subtle facial movements that occur spontaneously when people try to conceal real emotions. Micro-expression recognition is crucial in many fields, including criminal analysis and psychotherapy. However, micro-expression recognition is challenging since micro-expressions have low intensity and public datasets are small in size. To this end, a three-stream temporal-shift attent… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17538v2-abstract-full').style.display = 'inline'; document.getElementById('2406.17538v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.17538v2-abstract-full" style="display: none;"> Micro-expressions are subtle facial movements that occur spontaneously when people try to conceal real emotions. Micro-expression recognition is crucial in many fields, including criminal analysis and psychotherapy. However, micro-expression recognition is challenging since micro-expressions have low intensity and public datasets are small in size. To this end, a three-stream temporal-shift attention network based on self-knowledge distillation called SKD-TSTSAN is proposed in this paper. Firstly, to address the low intensity of muscle movements, we utilize learning-based motion magnification modules to enhance the intensity of muscle movements. Secondly, we employ efficient channel attention modules in the local-spatial stream to make the network focus on facial regions that are highly relevant to micro-expressions. In addition, temporal shift modules are used in the dynamic-temporal stream, which enables temporal modeling with no additional parameters by mixing motion information from two different temporal domains. 
Furthermore, we introduce self-knowledge distillation into the micro-expression recognition task by introducing auxiliary classifiers and using the deepest section of the network for supervision, encouraging all blocks to fully explore the features of the training set. Finally, extensive experiments are conducted on four public datasets: CASME II, SAMM, MMEW, and CAS(ME)3. The experimental results demonstrate that our SKD-TSTSAN outperforms other existing methods and achieves new state-of-the-art performance. Our code will be available at https://github.com/GuanghaoZhu663/SKD-TSTSAN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.17538v2-abstract-full').style.display = 'none'; document.getElementById('2406.17538v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 25 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.12769">arXiv:2406.12769</a> <span> [<a href="https://arxiv.org/pdf/2406.12769">pdf</a>, <a href="https://arxiv.org/format/2406.12769">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Latent Intuitive Physics: Learning to Transfer Hidden Physics from A 3D Video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+X">Xiangming Zhu</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Huayu Deng</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+H">Haochen Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yunbo Wang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xiaokang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.12769v1-abstract-short" style="display: inline;"> We introduce latent intuitive physics, a transfer learning framework for physics simulation that can infer hidden properties of fluids from a single 3D video and simulate the observed fluid in novel scenes. Our key insight is to use latent features drawn from a learnable prior distribution conditioned on the underlying particle states to capture the invisible and complex physical properties. To ac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12769v1-abstract-full').style.display = 'inline'; document.getElementById('2406.12769v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.12769v1-abstract-full" style="display: none;"> We introduce latent intuitive physics, a transfer learning framework for physics simulation that can infer hidden properties of fluids from a single 3D video and simulate the observed fluid in novel scenes. 
Our key insight is to use latent features drawn from a learnable prior distribution conditioned on the underlying particle states to capture the invisible and complex physical properties. To achieve this, we train a parametrized prior learner given visual observations to approximate the visual posterior of inverse graphics, and both the particle states and the visual posterior are obtained from a learned neural renderer. The converged prior learner is embedded in our probabilistic physics engine, allowing us to perform novel simulations on unseen geometries, boundaries, and dynamics without knowledge of the true physical parameters. We validate our model in three ways: (i) novel scene simulation with the learned visual-world physics, (ii) future prediction of the observed fluid dynamics, and (iii) supervised particle simulation. Our model demonstrates strong performance in all three tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.12769v1-abstract-full').style.display = 'none'; document.getElementById('2406.12769v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published as a conference paper at ICLR 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.08864">arXiv:2406.08864</a> <span> [<a href="https://arxiv.org/pdf/2406.08864">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Research on Early Warning Model of Cardiovascular Disease Based on Computer Deep Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+Y">Yuxiang Hu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jinxin Hu</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+T">Ting Xu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Bo Zhang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+J">Jiajie Yuan</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Haozhang Deng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.08864v1-abstract-short" style="display: inline;"> This project intends to study a cardiovascular disease risk early warning model based on one-dimensional convolutional neural networks. First, the missing values of 13 physiological and symptom indicators such as patient age, blood glucose, cholesterol, and chest pain were filled and Z-score was standardized. 
The convolutional neural network is converted into a 2D matrix, the convolution function… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08864v1-abstract-full').style.display = 'inline'; document.getElementById('2406.08864v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.08864v1-abstract-full" style="display: none;"> This project intends to study a cardiovascular disease risk early warning model based on one-dimensional convolutional neural networks. First, the missing values of 13 physiological and symptom indicators such as patient age, blood glucose, cholesterol, and chest pain were filled and Z-score was standardized. The convolutional neural network is converted into a 2D matrix, the convolution function of 1,3, and 5 is used for the first-order convolution operation, and the Max Pooling algorithm is adopted for dimension reduction. Set the learning rate and output rate. It is optimized by the Adam algorithm. The result of classification is output by a soft classifier. This study was conducted based on Statlog in the UCI database and heart disease database respectively. The empirical data indicate that the forecasting precision of this technique has been enhanced by 11.2%, relative to conventional approaches, while there is a significant improvement in the logarithmic curve fitting. The efficacy and applicability of the novel approach are corroborated through the examination employing a one-dimensional convolutional neural network. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.08864v1-abstract-full').style.display = 'none'; document.getElementById('2406.08864v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.20071">arXiv:2405.20071</a> <span> [<a href="https://arxiv.org/pdf/2405.20071">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Staged Approach using Machine Learning and Uncertainty Quantification to Predict the Risk of Hip Fracture </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shaik%2C+A">Anjum Shaik</a>, <a href="/search/cs?searchtype=author&query=Larsen%2C+K">Kristoffer Larsen</a>, <a href="/search/cs?searchtype=author&query=Lane%2C+N+E">Nancy E. Lane</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chen Zhao</a>, <a href="/search/cs?searchtype=author&query=Su%2C+K">Kuan-Jui Su</a>, <a href="/search/cs?searchtype=author&query=Keyak%2C+J+H">Joyce H. 
Keyak</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Q">Qing Tian</a>, <a href="/search/cs?searchtype=author&query=Sha%2C+Q">Qiuying Sha</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H">Hui Shen</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Hong-Wen Deng</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+W">Weihua Zhou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.20071v1-abstract-short" style="display: inline;"> Despite advancements in medical care, hip fractures impose a significant burden on individuals and healthcare systems. This paper focuses on the prediction of hip fracture risk in older and middle-aged adults, where falls and compromised bone quality are predominant factors. We propose a novel staged model that combines advanced imaging and clinical data to improve predictive performance. By using… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20071v1-abstract-full').style.display = 'inline'; document.getElementById('2405.20071v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.20071v1-abstract-full" style="display: none;"> Despite advancements in medical care, hip fractures impose a significant burden on individuals and healthcare systems. This paper focuses on the prediction of hip fracture risk in older and middle-aged adults, where falls and compromised bone quality are predominant factors. We propose a novel staged model that combines advanced imaging and clinical data to improve predictive performance. By using CNNs to extract features from hip DXA images, along with clinical variables, shape measurements, and texture features, our method provides a comprehensive framework for assessing fracture risk. A staged machine learning-based model was developed using two ensemble models: Ensemble 1 (clinical variables only) and Ensemble 2 (clinical variables and DXA imaging features). This staged approach used uncertainty quantification from Ensemble 1 to decide if DXA features are necessary for further prediction. Ensemble 2 exhibited the highest performance, achieving an AUC of 0.9541, an accuracy of 0.9195, a sensitivity of 0.8078, and a specificity of 0.9427. The staged model also performed well, with an AUC of 0.8486, an accuracy of 0.8611, a sensitivity of 0.5578, and a specificity of 0.9249, outperforming Ensemble 1, which had an AUC of 0.5549, an accuracy of 0.7239, a sensitivity of 0.1956, and a specificity of 0.8343. Furthermore, the staged model suggested that 54.49% of patients did not require DXA scanning. It effectively balanced accuracy and specificity, offering a robust solution when DXA data acquisition is not always feasible. Statistical tests confirmed significant differences between the models, highlighting the advantages of the advanced modeling strategies. Our staged approach could identify individuals at risk with a high accuracy but reduce the unnecessary DXA scanning. It has great promise to guide interventions to prevent hip fractures with reduced cost and radiation. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.20071v1-abstract-full').style.display = 'none'; document.getElementById('2405.20071v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">29 pages, 5 figures, 6 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.10691">arXiv:2405.10691</a> <span> [<a href="https://arxiv.org/pdf/2405.10691">pdf</a>, <a href="https://arxiv.org/format/2405.10691">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LoCI-DiffCom: Longitudinal Consistency-Informed Diffusion Model for 3D Infant Brain Image Completion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zihao Zhu</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+T">Tianli Tao</a>, <a href="/search/cs?searchtype=author&query=Tao%2C+Y">Yitian Tao</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+H">Haowen Deng</a>, <a href="/search/cs?searchtype=author&query=Cai%2C+X">Xinyi Cai</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+G">Gaofeng Wu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kaidong Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+H">Haifeng Tang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+L">Lixuan Zhu</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Z">Zhuoyang Gu</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiawei Huang</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+D">Dinggang Shen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Han Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.10691v1-abstract-short" style="display: inline;"> The infant brain undergoes rapid development in the first few years after birth.Compared to cross-sectional studies, longitudinal studies can depict the trajectories of infants brain development with higher accuracy, statistical power and flexibility.However, the collection of infant longitudinal magnetic resonance (MR) data suffers a notorious dropout problem, resulting in incomplete datasets wit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10691v1-abstract-full').style.display = 'inline'; document.getElementById('2405.10691v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.10691v1-abstract-full" style="display: none;"> The infant brain undergoes rapid development in the first few years after birth.Compared to cross-sectional studies, longitudinal studies can depict the trajectories of infants brain development with higher accuracy, statistical 
power and flexibility.However, the collection of infant longitudinal magnetic resonance (MR) data suffers a notorious dropout problem, resulting in incomplete datasets with missing time points. This limitation significantly impedes subsequent neuroscience and clinical modeling. Yet, existing deep generative models are facing difficulties in missing brain image completion, due to sparse data and the nonlinear, dramatic contrast/geometric variations in the developing brain. We propose LoCI-DiffCom, a novel Longitudinal Consistency-Informed Diffusion model for infant brain image Completion,which integrates the images from preceding and subsequent time points to guide a diffusion model for generating high-fidelity missing data. Our designed LoCI module can work on highly sparse sequences, relying solely on data from two temporal points. Despite wide separation and diversity between age time points, our approach can extract individualized developmental features while ensuring context-aware consistency. Our experiments on a large infant brain MR dataset demonstrate its effectiveness with consistent performance on missing infant brain MR completion even in big gap scenarios, aiding in better delineation of early developmental trajectories. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.10691v1-abstract-full').style.display = 'none'; document.getElementById('2405.10691v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Deng%2C+H&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Deng%2C+H&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Deng%2C+H&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Deng%2C+H&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Deng%2C+H&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Deng%2C+H&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li> <a href="/search/?searchtype=author&query=Deng%2C+H&start=250" class="pagination-link " aria-label="Page 6" aria-current="page">6 </a> </li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" 
viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 
47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>