Search | arXiv e-print repository
<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 4,388 results for author: <span class="mathjax">Chen, J</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Chen%2C+J">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Chen, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Chen%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Chen, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Chen%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Chen%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14427">arXiv:2411.14427</a> <span> [<a href="https://arxiv.org/pdf/2411.14427">pdf</a>, <a href="https://arxiv.org/format/2411.14427">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Transformer-based Heuristic for Advanced Air Mobility Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiang%2C+J">Jun Xiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14427v1-abstract-short" style="display: inline;"> Safety is extremely important for urban flights of autonomous Unmanned Aerial Vehicles (UAVs). Risk-aware path planning is one of the most effective methods to guarantee the safety of UAVs. This type of planning can be represented as a Constrained Shortest Path (CSP) problem, which seeks to find the shortest route that meets a predefined safety constraint. Solving CSP problems is NP-hard, presenti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14427v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14427v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14427v1-abstract-full" style="display: none;"> Safety is extremely important for urban flights of autonomous Unmanned Aerial Vehicles (UAVs). 
Risk-aware path planning is one of the most effective methods to guarantee the safety of UAVs. This type of planning can be represented as a Constrained Shortest Path (CSP) problem, which seeks to find the shortest route that meets a predefined safety constraint. Solving CSP problems is NP-hard, presenting significant computational challenges. Although traditional methods can accurately solve CSP problems, they tend to be very slow. Previously, we introduced an additional safety dimension to the traditional A* algorithm, known as ASD A*, to effectively handle Constrained Shortest Path (CSP) problems. Then, we developed a custom learning-based heuristic using transformer-based neural networks, which significantly reduced computational load and enhanced the performance of the ASD A* algorithm. In this paper, we expand our dataset to include more risk maps and tasks, improve the proposed model, and increase its performance. We also introduce a new heuristic strategy and a novel neural network, which enhance the overall effectiveness of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14427v1-abstract-full').style.display = 'none'; document.getElementById('2411.14427v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2024 AIAA DATC/IEEE 43rd Digital Avionics Systems Conference (DASC)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14403">arXiv:2411.14403</a> <span> [<a href="https://arxiv.org/pdf/2411.14403">pdf</a>, <a href="https://arxiv.org/format/2411.14403">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Landing Trajectory Prediction for UAS Based on Generative Adversarial Network </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiang%2C+J">Jun Xiang</a>, <a href="/search/cs?searchtype=author&query=Essick%2C+D">Drake Essick</a>, <a href="/search/cs?searchtype=author&query=Bautista%2C+L+G">Luiz Gonzalez Bautista</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Junfei Xie</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jun Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14403v1-abstract-short" style="display: inline;"> Models for trajectory prediction are an essential component of many advanced air mobility studies. These models help aircraft detect conflict and plan avoidance maneuvers, which is especially important in Unmanned Aircraft systems (UAS) landing management due to the congested airspace near vertiports. 
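The abstract above describes adding an accumulated-risk dimension to A* so that the search respects a safety budget. As a rough, hedged illustration of that general idea (not the authors' ASD A* implementation, and with a plain Euclidean heuristic standing in for their learned transformer heuristic), a minimal grid-based sketch might look like this:

```python
import heapq
import math

def risk_constrained_astar(grid_risk, start, goal, risk_budget):
    """A*-style search over (cell, accumulated-risk) states.

    grid_risk  : 2D list of per-cell risk values.
    risk_budget: maximum total risk allowed along the path (the CSP constraint).
    Returns (path_length, total_risk, path) or None if no feasible path exists.
    The heuristic is plain Euclidean distance to the goal; the paper replaces
    this with a learned, transformer-based heuristic to speed up the search.
    """
    rows, cols = len(grid_risk), len(grid_risk[0])
    heuristic = lambda cell: math.dist(cell, goal)

    start_risk = grid_risk[start[0]][start[1]]
    # Each queue entry: (f = g + h, g, accumulated risk, cell, path so far)
    frontier = [(heuristic(start), 0.0, start_risk, start, [start])]
    # For each cell, remember non-dominated (g, risk) pairs already expanded.
    expanded = {}

    while frontier:
        f, g, risk, cell, path = heapq.heappop(frontier)
        if cell == goal:
            return g, risk, path
        # Dominance pruning: skip if this cell was already expanded with a
        # path that is both shorter (or equal) and safer (or equal).
        if any(g2 <= g and r2 <= risk for g2, r2 in expanded.get(cell, [])):
            continue
        expanded.setdefault(cell, []).append((g, risk))

        r0, c0 = cell
        for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
            r1, c1 = r0 + dr, c0 + dc
            if not (0 <= r1 < rows and 0 <= c1 < cols):
                continue
            new_risk = risk + grid_risk[r1][c1]
            if new_risk > risk_budget:          # violates the safety constraint
                continue
            new_g = g + 1.0
            heapq.heappush(frontier, (new_g + heuristic((r1, c1)),
                                      new_g, new_risk, (r1, c1), [*path, (r1, c1)]))
    return None

# Toy usage: 4x4 risk map, keep total path risk below 0.5.
risk_map = [[0.0, 0.1, 0.4, 0.0],
            [0.0, 0.3, 0.1, 0.0],
            [0.0, 0.0, 0.0, 0.2],
            [0.1, 0.0, 0.1, 0.0]]
print(risk_constrained_astar(risk_map, (0, 0), (3, 3), risk_budget=0.5))
```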
2. arXiv:2411.14403 [pdf, other] cs.RO (Robotics), cs.AI (Artificial Intelligence)
Landing Trajectory Prediction for UAS Based on Generative Adversarial Network
Authors: Jun Xiang, Drake Essick, Luiz Gonzalez Bautista, Junfei Xie, Jun Chen
Abstract: Models for trajectory prediction are an essential component of many advanced air mobility studies. These models help aircraft detect conflicts and plan avoidance maneuvers, which is especially important in Unmanned Aircraft Systems (UAS) landing management due to the congested airspace near vertiports. In this paper, we propose a landing trajectory prediction model for UAS based on a Generative Adversarial Network (GAN). GANs have been developed over many years and have achieved state-of-the-art results in many generation tasks. A GAN consists of a neural network generator and a neural network discriminator; owing to the learning capacity of these networks, the generator is capable of capturing the features of the sample trajectories. The generator takes the previous trajectory as input and outputs randomly sampled future flight states. According to the experimental results, the proposed model produces more accurate predictions than the baseline method (GMR) on various datasets. To evaluate the proposed model, we also create a real UAV landing dataset that includes more than 2600 trajectories of drones manually controlled by real pilots.
Submitted 21 November, 2024; originally announced November 2024.
Comments: 9 pages, AIAA SCITECH 2023
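The entry above describes a generator that conditions on the past trajectory and a discriminator that judges full trajectories. A minimal conditional-GAN sketch for trajectory forecasting is given below; the layer sizes, sequence lengths, and module names are illustrative assumptions, not the paper's architecture.

```python
import torch
import torch.nn as nn

# State = (x, y, z) position at each time step; PAST/FUT/NOISE sizes are made up.
PAST, FUT, DIM, NOISE = 20, 10, 3, 16

class Generator(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.GRU(DIM, 64, batch_first=True)       # summarize past trajectory
        self.decoder = nn.Sequential(
            nn.Linear(64 + NOISE, 128), nn.ReLU(),
            nn.Linear(128, FUT * DIM))                          # predict future states

    def forward(self, past, z):
        _, h = self.encoder(past)                               # h: (1, B, 64)
        out = self.decoder(torch.cat([h[-1], z], dim=-1))
        return out.view(-1, FUT, DIM)

class Discriminator(nn.Module):
    def __init__(self):
        super().__init__()
        self.rnn = nn.GRU(DIM, 64, batch_first=True)
        self.head = nn.Linear(64, 1)

    def forward(self, past, future):
        _, h = self.rnn(torch.cat([past, future], dim=1))       # score the full trajectory
        return self.head(h[-1])

G, D = Generator(), Discriminator()
past = torch.randn(8, PAST, DIM)                                # dummy batch of past trajectories
fake_future = G(past, torch.randn(8, NOISE))
score = D(past, fake_future)
print(fake_future.shape, score.shape)                           # (8, 10, 3), (8, 1)
```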
3. arXiv:2411.14356 [pdf, other] cs.RO (Robotics)
Convex Approximation of Probabilistic Reachable Sets from Small Samples Using Self-supervised Neural Networks
Authors: Jun Xiang, Jun Chen
Abstract: Probabilistic Reachable Set (PRS) plays a crucial role in many fields of autonomous systems, yet efficiently generating PRS remains a significant challenge. This paper presents a learning approach to generating 2-dimensional PRS for states in a dynamic system. Traditional methods such as Hamilton-Jacobi reachability analysis, Monte Carlo, and Gaussian process classification face significant computational challenges or require detailed dynamics information, limiting their applicability in realistic situations. Existing data-driven methods may lack accuracy. To overcome these limitations, we propose leveraging neural networks, commonly used in imitation learning and computer vision, to imitate expert methods to generate PRS approximations. We trained the neural networks using a multi-label, self-supervised learning approach. We selected the fine-tuned convex approximation method as the expert to create expert PRS. Additionally, we continued sampling from the distribution to obtain a diverse array of sample sets. Given a small sample set, the trained neural networks can replicate the PRS approximation generated by the expert method, while the generation speed is much faster.
Submitted 21 November, 2024; originally announced November 2024.
Comments: 10 pages
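To make the "expert convex approximation" idea concrete, here is one hedged way to build a convex PRS estimate from Monte Carlo state samples; the quantile-trimming rule and coverage parameter are illustrative assumptions, not the paper's expert method, and this is the kind of target the described network would learn to imitate from small sample sets.

```python
import numpy as np
from scipy.spatial import ConvexHull

def convex_prs_from_samples(samples, coverage=0.95):
    """Convex approximation of a 2-D probabilistic reachable set from samples.

    samples : (N, 2) array of Monte Carlo reachable-state samples.
    coverage: keep the `coverage` fraction of points closest to the mean
              (by Mahalanobis distance) and return their convex hull vertices.
    """
    mean = samples.mean(axis=0)
    cov_inv = np.linalg.inv(np.cov(samples, rowvar=False))
    d = np.einsum('ij,jk,ik->i', samples - mean, cov_inv, samples - mean)
    kept = samples[d <= np.quantile(d, coverage)]
    hull = ConvexHull(kept)
    return kept[hull.vertices]           # polygon vertices of the PRS approximation

rng = np.random.default_rng(0)
states = rng.multivariate_normal([0.0, 0.0], [[1.0, 0.3], [0.3, 0.5]], size=500)
polygon = convex_prs_from_samples(states, coverage=0.9)
print(polygon.shape)                     # (num_vertices, 2)
```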
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13909">arXiv:2411.13909</a> <span> [<a href="https://arxiv.org/pdf/2411.13909">pdf</a>, <a href="https://arxiv.org/format/2411.13909">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Panther: Illuminate the Sight of Multimodal LLMs with Instruction-Guided Visual Prompts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+H">Honglin Li</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+Y">Yuting Gao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+C">Chenglu Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingdong Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Ming Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Lin Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13909v1-abstract-short" style="display: inline;"> Multimodal large language models (MLLMs) are closing the gap to human visual perception capability rapidly, while, still lag behind on attending to subtle images details or locating small objects precisely, etc. Common schemes to tackle these issues include deploying multiple vision encoders or operating on original high-resolution images. Few studies have concentrated on taking the textual instru… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13909v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13909v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13909v1-abstract-full" style="display: none;"> Multimodal large language models (MLLMs) are closing the gap to human visual perception capability rapidly, while, still lag behind on attending to subtle images details or locating small objects precisely, etc. Common schemes to tackle these issues include deploying multiple vision encoders or operating on original high-resolution images. Few studies have concentrated on taking the textual instruction into improving visual representation, resulting in losing focus in some vision-centric tasks, a phenomenon we herein termed as Amblyopia. In this work, we introduce Panther, a MLLM that closely adheres to user instruction and locates targets of interests precisely, with the finesse of a black panther. Specifically, Panther comprises three integral components: Panther-VE, Panther-Bridge, and Panther-Decoder. Panther-VE integrates user instruction information at the early stages of the vision encoder, thereby extracting the most relevant and useful visual representations. The Panther-Bridge module, equipped with powerful filtering capabilities, significantly reduces redundant visual information, leading to a substantial savings in training costs. The Panther-Decoder is versatile and can be employed with any decoder-only architecture of LLMs without discrimination. 
5. arXiv:2411.13900 [pdf, other] cs.AR (Hardware Architecture)
Dissecting Conditional Branch Predictors of Apple Firestorm and Qualcomm Oryon for Software Optimization and Architectural Analysis
Authors: Jiajie Chen, Peng Qu, Youhui Zhang
Abstract: The branch predictor (BP) is a critical component of modern processors, and its accurate modeling is essential for compilers and applications. However, processor vendors have disclosed limited details about their BP implementations. Recent advancements in reverse engineering the BP of general-purpose processors have enabled the creation of more accurate BP models. Nonetheless, we have identified critical deficiencies in the existing methods. For instance, they impose strong assumptions on the branch history update function and the index/tag functions of key BP components, limiting their applicability to a broader range of processors, including those from Apple and Qualcomm. In this paper, we design a more general branch prediction reverse engineering pipeline that can additionally recover the conditional branch predictors (CBPs) of the Apple Firestorm and Qualcomm Oryon microarchitectures, and subsequently build accurate CBP models. Leveraging these models, we uncover two previously undisclosed effects that impair branch prediction accuracy and propose related solutions, resulting in up to 14% MPKI reduction and 7% performance improvement in representative applications. Furthermore, we conduct a comprehensive comparison of the known Intel/Apple/Qualcomm CBPs using a unified standalone branch predictor simulator, which facilitates a deeper understanding of CBP behavior.
Submitted 21 November, 2024; originally announced November 2024.
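As a small illustration of what a "standalone branch predictor simulator" and the MPKI metric mean, the sketch below simulates a classic gshare conditional branch predictor over a synthetic trace; it is not a model of the Firestorm or Oryon CBPs, and the trace and table sizes are made up.

```python
class Gshare:
    def __init__(self, index_bits=12):
        self.mask = (1 << index_bits) - 1
        self.table = [2] * (1 << index_bits)   # 2-bit saturating counters, init weakly taken
        self.ghr = 0                           # global history register

    def predict(self, pc):
        idx = (pc ^ self.ghr) & self.mask
        return self.table[idx] >= 2            # predict taken if counter in {2, 3}

    def update(self, pc, taken):
        idx = (pc ^ self.ghr) & self.mask
        self.table[idx] = min(3, self.table[idx] + 1) if taken else max(0, self.table[idx] - 1)
        self.ghr = ((self.ghr << 1) | int(taken)) & self.mask

def simulate(trace, inst_count):
    bp, mispredicts = Gshare(), 0
    for pc, taken in trace:
        if bp.predict(pc) != taken:
            mispredicts += 1
        bp.update(pc, taken)
    return 1000.0 * mispredicts / inst_count   # MPKI: mispredictions per kilo-instruction

# Synthetic trace: a loop branch taken 7 of every 8 iterations.
trace = [(0x400, i % 8 != 7) for i in range(10_000)]
print(f"MPKI = {simulate(trace, inst_count=50_000):.2f}")
```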
6. arXiv:2411.13881 [pdf, other] cs.LG (Machine Learning), physics.data-an (Data Analysis, Statistics and Probability)
Exploring applications of topological data analysis in stock index movement prediction
Authors: Dazhi Huang, Pengcheng Xu, Xiaocheng Huang, Jiayi Chen
Abstract: Topological Data Analysis (TDA) has recently gained significant attention in the field of financial prediction. However, the choice of point cloud construction methods, topological feature representations, and classification models has a substantial impact on prediction results. This paper addresses the classification problem of stock index movement. First, we construct point clouds for stock indices using three different methods. Next, we apply TDA to extract topological structures from the point clouds. Four distinct topological features are computed to represent the patterns in the data, and 15 combinations of these features are enumerated and input into six different machine learning models. We evaluate the predictive performance of various TDA configurations by conducting index movement classification tasks on datasets such as CSI, DAX, HSI, and FTSE, providing insights into the efficiency of different TDA setups.
Submitted 21 November, 2024; originally announced November 2024.
Comments: 20 pages, 10 figures
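For readers unfamiliar with the TDA pipeline described above, here is a hedged sketch of one possible point cloud construction (a time-delay embedding of a return series) and one simple topological feature; it assumes the ripser package for persistent homology, and the embedding parameters and feature choice are illustrative, not the paper's three construction methods or four features.

```python
import numpy as np
from ripser import ripser   # assumed dependency for persistent homology

def sliding_window_cloud(series, dim=3, delay=1):
    """Takens-style delay embedding: turn a 1-D return series into a point cloud."""
    n = len(series) - (dim - 1) * delay
    return np.stack([series[i:i + (dim - 1) * delay + 1:delay] for i in range(n)])

def max_h1_persistence(cloud):
    """One simple topological feature: the longest-lived 1-dimensional hole."""
    dgm_h1 = ripser(cloud, maxdim=1)['dgms'][1]
    return float((dgm_h1[:, 1] - dgm_h1[:, 0]).max()) if len(dgm_h1) else 0.0

# Toy example: synthetic daily log-returns of an index over one rolling window.
rng = np.random.default_rng(1)
returns = rng.normal(0, 0.01, size=120)
cloud = sliding_window_cloud(returns, dim=3, delay=2)
print(cloud.shape, max_h1_persistence(cloud))
```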
7. arXiv:2411.13874 [pdf, other] cs.CR (Cryptography and Security), cs.AI (Artificial Intelligence)
Next-Generation Phishing: How LLM Agents Empower Cyber Attackers
Authors: Khalifa Afane, Wenqi Wei, Ying Mao, Junaid Farooq, Juntao Chen
Abstract: The escalating threat of phishing emails has become increasingly sophisticated with the rise of Large Language Models (LLMs). As attackers exploit LLMs to craft more convincing and evasive phishing emails, it is crucial to assess the resilience of current phishing defenses. In this study, we conduct a comprehensive evaluation of traditional phishing detectors, such as Gmail Spam Filter, Apache SpamAssassin, and Proofpoint, as well as machine learning models like SVM, Logistic Regression, and Naive Bayes, in identifying both traditional and LLM-rephrased phishing emails. We also explore the emerging role of LLMs as phishing detection tools, a method already adopted by companies like NTT Security Holdings and JPMorgan Chase. Our results reveal notable declines in detection accuracy for rephrased emails across all detectors, highlighting critical weaknesses in current phishing defenses. As the threat landscape evolves, our findings underscore the need for stronger security controls and regulatory oversight on LLM-generated content to prevent its misuse in creating advanced phishing attacks. This study contributes to the development of more effective Cyber Threat Intelligence (CTI) by leveraging LLMs to generate diverse phishing variants that can be used for data augmentation, harnessing the power of LLMs to enhance phishing detection, and paving the way for more robust and adaptable threat detection systems.
Submitted 21 November, 2024; originally announced November 2024.
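The evaluation described above compares classical text classifiers on original versus LLM-rephrased phishing emails. A minimal scikit-learn sketch of that comparison is shown below; the tiny corpora are placeholders, not the study's data, and the exact feature extraction and model settings are assumptions.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

# Placeholder corpora; in the study these would be real phishing/legitimate emails
# and their LLM-rephrased counterparts.
train_texts = ["verify your account now", "urgent wire transfer needed",
               "meeting notes attached", "lunch on friday?"]
train_labels = [1, 1, 0, 0]                       # 1 = phishing, 0 = legitimate
original_test = (["click to reset your password"], [1])
rephrased_test = (["kindly confirm your credentials at your convenience"], [1])

for name, clf in [("LogReg", LogisticRegression()),
                  ("SVM", LinearSVC()),
                  ("NaiveBayes", MultinomialNB())]:
    model = make_pipeline(TfidfVectorizer(), clf).fit(train_texts, train_labels)
    acc_orig = model.score(*original_test)
    acc_rephrased = model.score(*rephrased_test)
    print(f"{name}: original={acc_orig:.2f}, LLM-rephrased={acc_rephrased:.2f}")
```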
8. arXiv:2411.13797 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
Hugging Rain Man: A Novel Facial Action Units Dataset for Analyzing Atypical Facial Expressions in Children with Autism Spectrum Disorder
Authors: Yanfeng Ji, Shutong Wang, Ruyi Xu, Jingying Chen, Xinzhou Jiang, Zhengyu Deng, Yuxuan Quan, Junpeng Liu
Abstract: Children with Autism Spectrum Disorder (ASD) often exhibit atypical facial expressions. However, the specific objective facial features that underlie this subjective perception remain unclear. In this paper, we introduce a novel dataset, Hugging Rain Man (HRM), which includes facial action units (AUs) manually annotated by FACS experts for both children with ASD and those with typical development (TD). The dataset comprises a rich collection of posed and spontaneous facial expressions, totaling approximately 130,000 frames, along with 22 AUs, 10 Action Descriptors (ADs), and atypicality ratings. A statistical analysis of static images from the HRM reveals significant differences between the ASD and TD groups across multiple AUs and ADs when displaying the same emotional expressions, confirming that participants with ASD tend to demonstrate more irregular and diverse expression patterns. Subsequently, a temporal regression method is presented to analyze the atypicality of dynamic sequences, thereby bridging the gap between subjective perception and objective facial characteristics. Furthermore, baseline results for AU detection are provided for future research reference. This work not only contributes to our understanding of the unique facial expression characteristics associated with ASD but also provides potential tools for ASD early screening. Portions of the dataset, features, and pretrained models are accessible at https://github.com/Jonas-DL/Hugging-Rain-Man.
Submitted 20 November, 2024; originally announced November 2024.
Comments: Portions of the dataset, features, and pretrained models are accessible at: https://github.com/Jonas-DL/Hugging-Rain-Man
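The statistical analysis mentioned above compares AU behavior between the ASD and TD groups. The snippet below shows one generic way such a per-AU group comparison could be run; the synthetic intensity values, the choice of AU12, and the Mann-Whitney test are placeholders for illustration, not the paper's data or exact procedure.

```python
import numpy as np
from scipy.stats import mannwhitneyu

# Illustrative per-frame AU intensity comparison between groups; the arrays
# below are synthetic placeholders, not values from the HRM dataset.
rng = np.random.default_rng(42)
asd_au12 = rng.beta(2.0, 5.0, size=300)   # e.g. AU12 (lip-corner puller) intensities, ASD group
td_au12 = rng.beta(2.5, 4.0, size=300)    # same AU, TD group

stat, p = mannwhitneyu(asd_au12, td_au12, alternative="two-sided")
print(f"AU12 ASD vs TD: U={stat:.1f}, p={p:.4g}")
```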
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Portions of the dataset, features, and pretrained models are accessible at: https://github.com/Jonas-DL/Hugging-Rain-Man</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13389">arXiv:2411.13389</a> <span> [<a href="https://arxiv.org/pdf/2411.13389">pdf</a>, <a href="https://arxiv.org/format/2411.13389">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> UKFin+: A Research Agenda for Financial Services </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jing Chen</a>, <a href="/search/cs?searchtype=author&query=Elliott%2C+K">Karen Elliott</a>, <a href="/search/cs?searchtype=author&query=Knottenbelt%2C+W">William Knottenbelt</a>, <a href="/search/cs?searchtype=author&query=van+Moorsel%2C+A">Aad van Moorsel</a>, <a href="/search/cs?searchtype=author&query=Orpin%2C+H">Helen Orpin</a>, <a href="/search/cs?searchtype=author&query=Robertson%2C+S">Sheena Robertson</a>, <a href="/search/cs?searchtype=author&query=Vines%2C+J">John Vines</a>, <a href="/search/cs?searchtype=author&query=Wolter%2C+K">Katinka Wolter</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13389v1-abstract-short" style="display: inline;"> This document presents a research agenda for financial services as a deliverable of UKFin+, a Network Plus grant funded by the Engineering and Physical Sciences Research Council. UKFin+ fosters research collaborations between academic and non-academic partners directed at tackling complex long-term challenges relevant to the UK's financial services sector. Confronting these challenges is crucial t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13389v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13389v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13389v1-abstract-full" style="display: none;"> This document presents a research agenda for financial services as a deliverable of UKFin+, a Network Plus grant funded by the Engineering and Physical Sciences Research Council. UKFin+ fosters research collaborations between academic and non-academic partners directed at tackling complex long-term challenges relevant to the UK's financial services sector. Confronting these challenges is crucial to promote the long-term health and international competitiveness of the UK's financial services industry. As one route to impact, UKFin+ includes dedicated funding streams for research collaborations between academic researchers and non-academic organisations. The intended audience of this document includes researchers based in academia, academic funders, as well as practitioners based in industry, regulators, charities or NGOs. 
10. arXiv:2411.13383 [pdf, other] eess.IV (Image and Video Processing), cs.CV (Computer Vision and Pattern Recognition)
Adversarial Diffusion Compression for Real-World Image Super-Resolution
Authors: Bin Chen, Gehui Li, Rongyuan Wu, Xindong Zhang, Jie Chen, Jian Zhang, Lei Zhang
Abstract: Real-world image super-resolution (Real-ISR) aims to reconstruct high-resolution images from low-resolution inputs degraded by complex, unknown processes. While many Stable Diffusion (SD)-based Real-ISR methods have achieved remarkable success, their slow, multi-step inference hinders practical deployment. Recent SD-based one-step networks like OSEDiff and S3Diff alleviate this issue but still incur high computational costs due to their reliance on large pretrained SD models. This paper proposes a novel Real-ISR method, AdcSR, by distilling the one-step diffusion network OSEDiff into a streamlined diffusion-GAN model under our Adversarial Diffusion Compression (ADC) framework. We meticulously examine the modules of OSEDiff, categorizing them into two types: (1) removable (VAE encoder, prompt extractor, text encoder, etc.) and (2) prunable (denoising UNet and VAE decoder). Since direct removal and pruning can degrade the model's generation capability, we pretrain our pruned VAE decoder to restore its ability to decode images and employ adversarial distillation to compensate for the performance loss. This ADC-based diffusion-GAN hybrid design effectively reduces complexity by 73% in inference time, 78% in computation, and 74% in parameters, while preserving the model's generation capability. Experiments show that our proposed AdcSR achieves competitive recovery quality on both synthetic and real-world datasets, offering up to 9.3× speedup over previous one-step diffusion-based methods. Code and models will be made available.
Submitted 20 November, 2024; originally announced November 2024.
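To illustrate what "adversarial distillation" means in general terms, here is a hedged single-step sketch in which a pruned student is trained to match a frozen one-step teacher while also fooling a discriminator. The tiny networks and loss weights are stand-ins, not the OSEDiff or AdcSR architectures.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Tiny stand-in networks (not the real architectures).
student = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
                        nn.Conv2d(16, 3, 3, padding=1))
teacher = nn.Sequential(nn.Conv2d(3, 3, 3, padding=1)).eval()   # frozen one-step teacher
disc = nn.Sequential(nn.Conv2d(3, 16, 4, stride=2, padding=1), nn.LeakyReLU(0.2),
                     nn.Conv2d(16, 1, 4, stride=2, padding=1))

opt_s = torch.optim.Adam(student.parameters(), lr=1e-4)
opt_d = torch.optim.Adam(disc.parameters(), lr=1e-4)

lr_img = torch.rand(4, 3, 64, 64)                 # degraded low-quality input batch
with torch.no_grad():
    target = teacher(lr_img)                      # teacher's one-step restoration

# Discriminator update: real = teacher output, fake = student output.
fake = student(lr_img)
d_real, d_fake = disc(target), disc(fake.detach())
loss_d = F.binary_cross_entropy_with_logits(d_real, torch.ones_like(d_real)) + \
         F.binary_cross_entropy_with_logits(d_fake, torch.zeros_like(d_fake))
opt_d.zero_grad(); loss_d.backward(); opt_d.step()

# Student update: match the teacher (distillation) and fool the discriminator.
fake = student(lr_img)
d_fake = disc(fake)
loss_s = F.l1_loss(fake, target) + \
         0.1 * F.binary_cross_entropy_with_logits(d_fake, torch.ones_like(d_fake))
opt_s.zero_grad(); loss_s.backward(); opt_s.step()
print(loss_d.item(), loss_s.item())
```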
11. arXiv:2411.13136 [pdf, other] cs.CV (Computer Vision and Pattern Recognition)
TAPT: Test-Time Adversarial Prompt Tuning for Robust Inference in Vision-Language Models
Authors: Xin Wang, Kai Chen, Jiaming Zhang, Jingjing Chen, Xingjun Ma
Abstract: Large pre-trained Vision-Language Models (VLMs) such as CLIP have demonstrated excellent zero-shot generalizability across various downstream tasks. However, recent studies have shown that the inference performance of CLIP can be greatly degraded by small adversarial perturbations, especially to its visual modality, posing significant safety threats. To mitigate this vulnerability, in this paper, we propose a novel defense method called Test-Time Adversarial Prompt Tuning (TAPT) to enhance the inference robustness of CLIP against visual adversarial attacks. TAPT is a test-time defense method that learns defensive bimodal (textual and visual) prompts to robustify the inference process of CLIP. Specifically, it is an unsupervised method that optimizes the defensive prompts for each test sample by minimizing a multi-view entropy and aligning adversarial-clean distributions. We evaluate the effectiveness of TAPT on 11 benchmark datasets, including ImageNet and 10 other zero-shot datasets, demonstrating that it enhances the zero-shot adversarial robustness of the original CLIP by at least 48.9% against AutoAttack (AA), while largely maintaining performance on clean examples. Moreover, TAPT outperforms existing adversarial prompt tuning methods across various backbones, achieving an average robustness improvement of at least 36.6%.
Submitted 20 November, 2024; originally announced November 2024.
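The core mechanism described above is per-sample, unsupervised prompt optimization at test time via multi-view entropy minimization. The sketch below shows that loop in spirit; the frozen classifier is a toy stand-in for CLIP, the "augmentation" is simple noise, only a visual prompt is tuned, and the adversarial-clean distribution-alignment term is omitted.

```python
import torch
import torch.nn as nn

# Frozen toy classifier standing in for CLIP's zero-shot head.
classifier = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 10)).eval()
for p in classifier.parameters():
    p.requires_grad_(False)

def tune_prompt(image, n_views=8, steps=5, lr=0.05):
    """Tune an additive visual prompt so augmented views yield low-entropy predictions."""
    prompt = torch.zeros_like(image, requires_grad=True)      # defensive visual prompt
    opt = torch.optim.Adam([prompt], lr=lr)
    for _ in range(steps):
        views = image.unsqueeze(0) + 0.05 * torch.randn(n_views, *image.shape)
        logits = classifier(views + prompt)                   # prompt shared across views
        probs = logits.softmax(dim=-1)
        entropy = -(probs * probs.clamp_min(1e-8).log()).sum(dim=-1).mean()
        opt.zero_grad(); entropy.backward(); opt.step()
    return prompt.detach()

test_image = torch.rand(3, 32, 32)
prompt = tune_prompt(test_image)
robust_logits = classifier(test_image.unsqueeze(0) + prompt)
print(robust_logits.shape)                                    # (1, 10)
```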
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12892">arXiv:2411.12892</a> <span> [<a href="https://arxiv.org/pdf/2411.12892">pdf</a>, <a href="https://arxiv.org/format/2411.12892">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Selective Attention: Enhancing Transformer through Principled Context Control </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xuechen Zhang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+X">Xiangyu Chang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+M">Mingchen Li</a>, <a href="/search/cs?searchtype=author&query=Roy-Chowdhury%2C+A">Amit Roy-Chowdhury</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiasi Chen</a>, <a href="/search/cs?searchtype=author&query=Oymak%2C+S">Samet Oymak</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12892v1-abstract-short" style="display: inline;"> The attention mechanism within the transformer architecture enables the model to weigh and combine tokens based on their relevance to the query. While self-attention has enjoyed major success, it notably treats all queries $q$ in the same way by applying the mapping $V^\top\text{softmax}(Kq)$, where $V,K$ are the value and key embeddings respectively. In this work, we argue that this uniform treat… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12892v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12892v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12892v1-abstract-full" style="display: none;"> The attention mechanism within the transformer architecture enables the model to weigh and combine tokens based on their relevance to the query. While self-attention has enjoyed major success, it notably treats all queries $q$ in the same way by applying the mapping $V^\top\text{softmax}(Kq)$, where $V,K$ are the value and key embeddings respectively. In this work, we argue that this uniform treatment hinders the ability to control contextual sparsity and relevance. As a solution, we introduce the $\textit{Selective Self-Attention}$ (SSA) layer that augments the softmax nonlinearity with a principled temperature scaling strategy. By controlling temperature, SSA adapts the contextual sparsity of the attention map to the query embedding and its position in the context window. Through theory and experiments, we demonstrate that this alleviates attention dilution, aids the optimization process, and enhances the model's ability to control softmax spikiness of individual queries. We also incorporate temperature scaling for value embeddings and show that it boosts the model's ability to suppress irrelevant/noisy tokens. Notably, SSA is a lightweight method which introduces less than 0.5% new parameters through a weight-sharing strategy and can be fine-tuned on existing LLMs. 
Extensive empirical evaluations demonstrate that SSA-equipped models achieve a noticeable and consistent accuracy improvement on language modeling benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12892v1-abstract-full').style.display = 'none'; document.getElementById('2411.12892v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12814">arXiv:2411.12814</a> <span> [<a href="https://arxiv.org/pdf/2411.12814">pdf</a>, <a href="https://arxiv.org/format/2411.12814">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Interactive Medical Image Segmentation: A Benchmark Dataset and Baseline </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Junlong Cheng</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+B">Bin Fu</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jin Ye</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Guoan Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tianbin Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoyu Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+R">Ruoyu Li</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+H">He Yao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junren Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">JingWen Li</a>, <a href="/search/cs?searchtype=author&query=Su%2C+Y">Yanzhou Su</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+M">Min Zhu</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Junjun He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12814v1-abstract-short" style="display: inline;"> Interactive Medical Image Segmentation (IMIS) has long been constrained by the limited availability of large-scale, diverse, and densely annotated datasets, which hinders model generalization and consistent evaluation across different models. In this paper, we introduce the IMed-361M benchmark dataset, a significant advancement in general IMIS research. First, we collect and standardize over 6.4 m… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12814v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12814v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12814v1-abstract-full" style="display: none;"> Interactive Medical Image Segmentation (IMIS) has long been constrained by the limited availability of large-scale, diverse, and densely annotated datasets, which hinders model generalization and consistent evaluation across different models. In this paper, we introduce the IMed-361M benchmark dataset, a significant advancement in general IMIS research. 
First, we collect and standardize over 6.4 million medical images and their corresponding ground truth masks from multiple data sources. Then, leveraging the strong object recognition capabilities of a vision foundation model, we automatically generate dense interactive masks for each image and ensure their quality through rigorous quality control and granularity management. Unlike previous datasets, which are limited by specific modalities or sparse annotations, IMed-361M spans 14 modalities and 204 segmentation targets, totaling 361 million masks, an average of 56 masks per image. Finally, we develop an IMIS baseline network on this dataset that supports high-quality mask generation through interactive inputs, including clicks, bounding boxes, text prompts, and their combinations. We evaluate its performance on medical image segmentation tasks from multiple perspectives, demonstrating superior accuracy and scalability compared to existing interactive segmentation models. To facilitate research on foundation models in medical computer vision, we release the IMed-361M dataset and model at https://github.com/uni-medical/IMIS-Bench. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12814v1-abstract-full').style.display = 'none'; document.getElementById('2411.12814v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12787">arXiv:2411.12787</a> <span> [<a href="https://arxiv.org/pdf/2411.12787">pdf</a>, <a href="https://arxiv.org/format/2411.12787">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Visual Cue Enhancement and Dual Low-Rank Adaptation for Efficient Visual Instruction Fine-Tuning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiao%2C+P">Pengkun Jiao</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+B">Bin Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingjing Chen</a>, <a href="/search/cs?searchtype=author&query=Ngo%2C+C">Chong-Wah Ngo</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yu-Gang Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12787v1-abstract-short" style="display: inline;"> Fine-tuning multimodal large language models (MLLMs) presents significant challenges, including a reliance on high-level visual features that limits fine-grained detail comprehension, and data conflicts that arise from task complexity.
To address these issues, we propose an efficient fine-tuning framework with two novel approaches: Vision Cue Enhancement (VCE) and Dual Low-Rank Adaptation (Dual-Lo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12787v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12787v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12787v1-abstract-full" style="display: none;"> Fine-tuning multimodal large language models (MLLMs) presents significant challenges, including a reliance on high-level visual features that limits fine-grained detail comprehension, and data conflicts that arise from task complexity. To address these issues, we propose an efficient fine-tuning framework with two novel approaches: Vision Cue Enhancement (VCE) and Dual Low-Rank Adaptation (Dual-LoRA). VCE enhances the vision projector by integrating multi-level visual cues, improving the model's ability to capture fine-grained visual features. Dual-LoRA introduces a dual low-rank structure for instruction tuning, decoupling learning into skill and task spaces to enable precise control and efficient adaptation across diverse tasks. Our method simplifies implementation, enhances visual comprehension, and improves adaptability. Experiments on both downstream tasks and general benchmarks demonstrate the effectiveness of our proposed approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12787v1-abstract-full').style.display = 'none'; document.getElementById('2411.12787v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
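<p class="is-size-7">Read literally, the Dual-LoRA idea of decoupling instruction tuning into separate low-rank "skill" and "task" spaces suggests a frozen linear layer with two independent low-rank update branches. The sketch below is only an assumed reading for illustration (module name, ranks, and scaling are invented here); the paper's actual decoupling and routing are not specified in this abstract.</p> <pre><code>import torch
import torch.nn as nn

class DualLowRankLinear(nn.Module):
    """Hypothetical frozen linear layer with two low-rank adaptation branches,
    one for a "skill" space and one for a "task" space (an assumption about
    the Dual-LoRA structure, not the authors' code)."""
    def __init__(self, d_in, d_out, r_skill=8, r_task=8, alpha=16.0):
        super().__init__()
        self.base = nn.Linear(d_in, d_out, bias=False)
        self.base.weight.requires_grad_(False)      # frozen pretrained weight
        self.skill_down = nn.Linear(d_in, r_skill, bias=False)
        self.skill_up = nn.Linear(r_skill, d_out, bias=False)
        self.task_down = nn.Linear(d_in, r_task, bias=False)
        self.task_up = nn.Linear(r_task, d_out, bias=False)
        nn.init.zeros_(self.skill_up.weight)        # low-rank updates start at zero
        nn.init.zeros_(self.task_up.weight)
        self.scale = alpha / r_skill                # single shared scale for simplicity

    def forward(self, x):
        return (self.base(x)
                + self.scale * self.skill_up(self.skill_down(x))
                + self.scale * self.task_up(self.task_down(x)))
</code></pre>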
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12726">arXiv:2411.12726</a> <span> [<a href="https://arxiv.org/pdf/2411.12726">pdf</a>, <a href="https://arxiv.org/format/2411.12726">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Numerical Analysis">math.NA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation">stat.CO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> LazyDINO: Fast, scalable, and efficiently amortized Bayesian inversion via structure-exploiting and surrogate-driven measure transport </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+L">Lianghao Cao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Joshua Chen</a>, <a href="/search/cs?searchtype=author&query=Brennan%2C+M">Michael Brennan</a>, <a href="/search/cs?searchtype=author&query=O%27Leary-Roseberry%2C+T">Thomas O'Leary-Roseberry</a>, <a href="/search/cs?searchtype=author&query=Marzouk%2C+Y">Youssef Marzouk</a>, <a href="/search/cs?searchtype=author&query=Ghattas%2C+O">Omar Ghattas</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12726v1-abstract-short" style="display: inline;"> We present LazyDINO, a transport map variational inference method for fast, scalable, and efficiently amortized solutions of high-dimensional nonlinear Bayesian inverse problems with expensive parameter-to-observable (PtO) maps. Our method consists of an offline phase in which we construct a derivative-informed neural surrogate of the PtO map using joint samples of the PtO map and its Jacobian. Du… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12726v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12726v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12726v1-abstract-full" style="display: none;"> We present LazyDINO, a transport map variational inference method for fast, scalable, and efficiently amortized solutions of high-dimensional nonlinear Bayesian inverse problems with expensive parameter-to-observable (PtO) maps. Our method consists of an offline phase in which we construct a derivative-informed neural surrogate of the PtO map using joint samples of the PtO map and its Jacobian. During the online phase, when given observational data, we seek rapid posterior approximation using surrogate-driven training of a lazy map [Brennan et al., NeurIPS, (2020)], i.e., a structure-exploiting transport map with low-dimensional nonlinearity. The trained lazy map then produces approximate posterior samples or density evaluations. Our surrogate construction is optimized for amortized Bayesian inversion using lazy map variational inference. We show that (i) the derivative-based reduced basis architecture [O'Leary-Roseberry et al., Comput. Methods Appl. Mech. 
Eng., 388 (2022)] minimizes the upper bound on the expected error in surrogate posterior approximation, and (ii) the derivative-informed training formulation [O'Leary-Roseberry et al., J. Comput. Phys., 496 (2024)] minimizes the expected error due to surrogate-driven transport map optimization. Our numerical results demonstrate that LazyDINO is highly efficient in cost amortization for Bayesian inversion. We observe one to two orders of magnitude reduction of offline cost for accurate posterior approximation, compared to simulation-based amortized inference via conditional transport and conventional surrogate-driven transport. In particular, LazyDINO outperforms Laplace approximation consistently using fewer than 1000 offline samples, while other amortized inference methods struggle and sometimes fail at 16,000 offline samples. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12726v1-abstract-full').style.display = 'none'; document.getElementById('2411.12726v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12195">arXiv:2411.12195</a> <span> [<a href="https://arxiv.org/pdf/2411.12195">pdf</a>, <a href="https://arxiv.org/format/2411.12195">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Survey of Medical Vision-and-Language Applications and Their Techniques </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Q">Qi Chen</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+R">Ruoshan Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sinuo Wang</a>, <a href="/search/cs?searchtype=author&query=Phan%2C+V+M+H">Vu Minh Hieu Phan</a>, <a href="/search/cs?searchtype=author&query=Hengel%2C+A+v+d">Anton van den Hengel</a>, <a href="/search/cs?searchtype=author&query=Verjans%2C+J">Johan Verjans</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+Z">Zhibin Liao</a>, <a href="/search/cs?searchtype=author&query=To%2C+M">Minh-Son To</a>, <a href="/search/cs?searchtype=author&query=Xia%2C+Y">Yong Xia</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jian Chen</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yutong Xie</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Q">Qi Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12195v1-abstract-short" style="display: inline;"> Medical vision-and-language models (MVLMs) have attracted substantial interest due to their capability to offer a natural language interface for interpreting complex medical data. 
Their applications are versatile and have the potential to improve diagnostic accuracy and decision-making for individual patients while also contributing to enhanced public health monitoring, disease surveillance, and p… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12195v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12195v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12195v1-abstract-full" style="display: none;"> Medical vision-and-language models (MVLMs) have attracted substantial interest due to their capability to offer a natural language interface for interpreting complex medical data. Their applications are versatile and have the potential to improve diagnostic accuracy and decision-making for individual patients while also contributing to enhanced public health monitoring, disease surveillance, and policy-making through more efficient analysis of large data sets. MVLMs integrate natural language processing with medical images to enable a more comprehensive and contextual understanding of medical images alongside their corresponding textual information. Unlike general vision-and-language models trained on diverse, non-specialized datasets, MVLMs are purpose-built for the medical domain, automatically extracting and interpreting critical information from medical images and textual reports to support clinical decision-making. Popular clinical applications of MVLMs include automated medical report generation, medical visual question answering, medical multimodal segmentation, diagnosis and prognosis, and medical image-text retrieval. Here, we provide a comprehensive overview of MVLMs and the various medical tasks to which they have been applied. We conduct a detailed analysis of various vision-and-language model architectures, focusing on their distinct strategies for cross-modal integration/exploitation of medical visual and textual features. We also examine the datasets used for these tasks and compare the performance of different models based on standardized evaluation metrics. Furthermore, we highlight potential challenges and summarize future research trends and directions. The full collection of papers and codes is available at: https://github.com/YtongXie/Medical-Vision-and-Language-Tasks-and-Methodologies-A-Survey. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12195v1-abstract-full').style.display = 'none'; document.getElementById('2411.12195v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024.
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12181">arXiv:2411.12181</a> <span> [<a href="https://arxiv.org/pdf/2411.12181">pdf</a>, <a href="https://arxiv.org/format/2411.12181">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Enhancing Low Dose Computed Tomography Images Using Consistency Training Techniques </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gokmen%2C+M+S">Mahmut S. Gokmen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jie Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+G">Ge Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jin Chen</a>, <a href="/search/cs?searchtype=author&query=Bumgardner%2C+C">Cody Bumgardner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12181v1-abstract-short" style="display: inline;"> Diffusion models have a significant impact on a wide range of generative tasks, especially image inpainting and restoration. Despite improvements aimed at decreasing the number of function evaluations (NFE), iterative sampling remains computationally expensive. Consistency models, a new family of generative models, enable single-step sampling of high-quality data without the need f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12181v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12181v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12181v1-abstract-full" style="display: none;"> Diffusion models have a significant impact on a wide range of generative tasks, especially image inpainting and restoration. Despite improvements aimed at decreasing the number of function evaluations (NFE), iterative sampling remains computationally expensive. Consistency models, a new family of generative models, enable single-step sampling of high-quality data without the need for adversarial training. In this paper, we introduce the beta noise distribution, which provides flexibility in adjusting noise levels. This is combined with a sinusoidal curriculum that enhances the learning of the trajectory between the noise distribution and the posterior distribution of interest, allowing High Noise Improved Consistency Training (HN-iCT) to be trained in a supervised fashion. Additionally, the High Noise Improved Consistency Training with Image Condition (HN-iCT-CN) architecture is introduced, which enables taking low-dose images as a condition for extracting significant features by Weighted Attention Gates (WAG). Our results indicate that unconditional image generation using HN-iCT significantly outperforms basic CT and iCT training techniques with NFE=1 on the CIFAR10 and CelebA datasets. Moreover, our image-conditioned model demonstrates exceptional performance in enhancing low-dose (LD) CT scans.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12181v1-abstract-full').style.display = 'none'; document.getElementById('2411.12181v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12157">arXiv:2411.12157</a> <span> [<a href="https://arxiv.org/pdf/2411.12157">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> A Combined Encoder and Transformer Approach for Coherent and High-Quality Text Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiajing Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuo Wang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Z">Zhen Qi</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhenhong Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chihang Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Hongye Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12157v1-abstract-short" style="display: inline;"> This research introduces a novel text generation model that combines BERT's semantic interpretation strengths with GPT-4's generative capabilities, establishing a high standard in generating coherent, contextually accurate language. Through the combined architecture, the model enhances semantic depth and maintains smooth, human-like text flow, overcoming limitations seen in prior models. Experimen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12157v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12157v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12157v1-abstract-full" style="display: none;"> This research introduces a novel text generation model that combines BERT's semantic interpretation strengths with GPT-4's generative capabilities, establishing a high standard in generating coherent, contextually accurate language. Through the combined architecture, the model enhances semantic depth and maintains smooth, human-like text flow, overcoming limitations seen in prior models. Experimental benchmarks reveal that BERT-GPT-4 surpasses traditional models, including GPT-3, T5, BART, Transformer-XL, and CTRL, in key metrics like Perplexity and BLEU, showcasing its superior natural language generation performance. By fully utilizing contextual information, this hybrid model generates text that is not only logically coherent but also aligns closely with human language patterns, providing an advanced solution for text generation tasks. 
This research highlights the potential of integrating semantic understanding with advanced generative models, contributing new insights for NLP, and setting a foundation for broader applications of large-scale generative architectures in areas such as automated writing, question-answer systems, and adaptive conversational agents. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12157v1-abstract-full').style.display = 'none'; document.getElementById('2411.12157v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12146">arXiv:2411.12146</a> <span> [<a href="https://arxiv.org/pdf/2411.12146">pdf</a>, <a href="https://arxiv.org/format/2411.12146">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Self-supervised denoising of visual field data improves detection of glaucoma progression </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+S">Sean Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J+Y">Jun Yu Chen</a>, <a href="/search/cs?searchtype=author&query=Mohammadzadeh%2C+V">Vahid Mohammadzadeh</a>, <a href="/search/cs?searchtype=author&query=Besharati%2C+S">Sajad Besharati</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+J">Jaewon Lee</a>, <a href="/search/cs?searchtype=author&query=Nouri-Mahdavi%2C+K">Kouros Nouri-Mahdavi</a>, <a href="/search/cs?searchtype=author&query=Caprioli%2C+J">Joseph Caprioli</a>, <a href="/search/cs?searchtype=author&query=Fei%2C+Z">Zhe Fei</a>, <a href="/search/cs?searchtype=author&query=Scalzo%2C+F">Fabien Scalzo</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12146v1-abstract-short" style="display: inline;"> Perimetric measurements provide insight into a patient's peripheral vision and day-to-day functioning and are the main outcome measure for identifying progression of visual damage from glaucoma. However, visual field data can be noisy, exhibiting high variance, especially with increasing damage. In this study, we demonstrate the utility of self-supervised deep learning in denoising visual field da… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12146v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12146v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12146v1-abstract-full" style="display: none;"> Perimetric measurements provide insight into a patient's peripheral vision and day-to-day functioning and are the main outcome measure for identifying progression of visual damage from glaucoma. 
However, visual field data can be noisy, exhibiting high variance, especially with increasing damage. In this study, we demonstrate the utility of self-supervised deep learning in denoising visual field data from over 4000 patients to enhance its signal-to-noise ratio and its ability to detect true glaucoma progression. We deployed both a variational autoencoder (VAE) and a masked autoencoder to determine which self-supervised model best smooths the visual field data while reconstructing salient features that are less noisy and more predictive of worsening disease. Our results indicate that including a categorical p-value at every visual field location improves the smoothing of visual field data. Masked autoencoders led to cleaner denoised data than previous methods, such as variational autoencoders. A 4.7% increase in detection of progressing eyes with pointwise linear regression (PLR) was observed. The masked and variational autoencoders' smoothed data predicted glaucoma progression 2.3 months earlier when p-values were included compared to when they were not. The faster prediction of time to progression (TTP) and the higher percentage progression detected support our hypothesis that masking out visual field elements during training while including p-values at each location would improve the task of detection of visual field progression. Our study has clinically relevant implications regarding masking when training neural networks to denoise visual field data, resulting in earlier and more accurate detection of glaucoma progression. This denoising model can be integrated into future models for visual field analysis to enhance detection of glaucoma progression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12146v1-abstract-full').style.display = 'none'; document.getElementById('2411.12146v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
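<p class="is-size-7">For context on the pointwise linear regression (PLR) criterion mentioned above, the sketch below fits an ordinary least-squares trend per visual-field location over visits and flags locations whose sensitivity declines significantly. The slope and p-value thresholds are placeholders chosen for illustration, not the criteria used in this study.</p> <pre><code>import numpy as np
from scipy.stats import linregress

def pointwise_linear_regression(times, fields, slope_thresh=-1.0, p_thresh=0.05):
    """Illustrative PLR: `times` is (n_visits,) in years, `fields` is
    (n_visits, n_locations) sensitivity values in dB. A location is flagged
    as progressing when its slope is below `slope_thresh` dB/year with
    p < `p_thresh` (placeholder thresholds)."""
    flags = np.zeros(fields.shape[1], dtype=bool)
    for loc in range(fields.shape[1]):
        slope, _, _, p, _ = linregress(times, fields[:, loc])
        flags[loc] = (slope < slope_thresh) and (p < p_thresh)
    return flags
</code></pre>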
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12142">arXiv:2411.12142</a> <span> [<a href="https://arxiv.org/pdf/2411.12142">pdf</a>, <a href="https://arxiv.org/format/2411.12142">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Computational Method for Measuring "Open Codes" in Qualitative Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">John Chen</a>, <a href="/search/cs?searchtype=author&query=Lotsos%2C+A">Alexandros Lotsos</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Lexie Zhao</a>, <a href="/search/cs?searchtype=author&query=Hullman%2C+J">Jessica Hullman</a>, <a href="/search/cs?searchtype=author&query=Sherin%2C+B">Bruce Sherin</a>, <a href="/search/cs?searchtype=author&query=Wilensky%2C+U">Uri Wilensky</a>, <a href="/search/cs?searchtype=author&query=Horn%2C+M">Michael Horn</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12142v1-abstract-short" style="display: inline;"> Qualitative analysis is critical to understanding human datasets in many social science disciplines. Open coding is an inductive qualitative process that identifies and interprets "open codes" from datasets. Yet, meeting methodological expectations (such as "as exhaustive as possible") can be challenging. While many machine learning (ML)/generative AI (GAI) studies have attempted to support open c… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12142v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12142v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12142v1-abstract-full" style="display: none;"> Qualitative analysis is critical to understanding human datasets in many social science disciplines. Open coding is an inductive qualitative process that identifies and interprets "open codes" from datasets. Yet, meeting methodological expectations (such as "as exhaustive as possible") can be challenging. While many machine learning (ML)/generative AI (GAI) studies have attempted to support open coding, few have systematically measured or evaluated GAI outcomes, increasing potential bias risks. Building on Grounded Theory and Thematic Analysis theories, we present a computational method to measure and identify potential biases from "open codes" systematically. Instead of operationalizing human expert results as the "ground truth," our method is built upon a team-based approach between human and machine coders. 
We experiment with two HCI datasets to establish this method's reliability by 1) comparing it with human analysis, and 2) analyzing its output stability. We present evidence-based suggestions and example workflows for ML/GAI to support open coding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12142v1-abstract-full').style.display = 'none'; document.getElementById('2411.12142v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12136">arXiv:2411.12136</a> <span> [<a href="https://arxiv.org/pdf/2411.12136">pdf</a>, <a href="https://arxiv.org/format/2411.12136">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Visualizing Loss Functions as Topological Landscape Profiles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Geniesse%2C+C">Caleb Geniesse</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiaqing Chen</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+T">Tiankai Xie</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+G">Ge Shi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yaoqing Yang</a>, <a href="/search/cs?searchtype=author&query=Morozov%2C+D">Dmitriy Morozov</a>, <a href="/search/cs?searchtype=author&query=Perciano%2C+T">Talita Perciano</a>, <a href="/search/cs?searchtype=author&query=Mahoney%2C+M+W">Michael W. Mahoney</a>, <a href="/search/cs?searchtype=author&query=Maciejewski%2C+R">Ross Maciejewski</a>, <a href="/search/cs?searchtype=author&query=Weber%2C+G+H">Gunther H. Weber</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12136v1-abstract-short" style="display: inline;"> In machine learning, a loss function measures the difference between model predictions and ground-truth (or target) values. For neural network models, visualizing how this loss changes as model parameters are varied can provide insights into the local structure of the so-called loss landscape (e.g., smoothness) as well as global properties of the underlying model (e.g., generalization performance)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12136v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12136v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12136v1-abstract-full" style="display: none;"> In machine learning, a loss function measures the difference between model predictions and ground-truth (or target) values. 
For neural network models, visualizing how this loss changes as model parameters are varied can provide insights into the local structure of the so-called loss landscape (e.g., smoothness) as well as global properties of the underlying model (e.g., generalization performance). While various methods for visualizing the loss landscape have been proposed, many approaches limit sampling to just one or two directions, ignoring potentially relevant information in this extremely high-dimensional space. This paper introduces a new representation based on topological data analysis that enables the visualization of higher-dimensional loss landscapes. After describing this new topological landscape profile representation, we show how the shape of loss landscapes can reveal new details about model performance and learning dynamics, highlighting several use cases, including image segmentation (e.g., UNet) and scientific machine learning (e.g., physics-informed neural networks). Through these examples, we provide new insights into how loss landscapes vary across distinct hyperparameter spaces: we find that the topology of the loss landscape is simpler for better-performing models; and we observe greater variation in the shape of loss landscapes near transitions from low to high model performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12136v1-abstract-full').style.display = 'none'; document.getElementById('2411.12136v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11943">arXiv:2411.11943</a> <span> [<a href="https://arxiv.org/pdf/2411.11943">pdf</a>, <a href="https://arxiv.org/format/2411.11943">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Medical Video Generation for Disease Progression Simulation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xu Cao</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+K">Kaizhao Liang</a>, <a href="/search/cs?searchtype=author&query=Liao%2C+K">Kuei-Da Liao</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+T">Tianren Gao</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+W">Wenqian Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jintai Chen</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+Z">Zhiguang Ding</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+J">Jianguo Cao</a>, <a href="/search/cs?searchtype=author&query=Rehg%2C+J+M">James M. 
Rehg</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jimeng Sun</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11943v1-abstract-short" style="display: inline;"> Modeling disease progression is crucial for improving the quality and efficacy of clinical diagnosis and prognosis, but it is often hindered by a lack of longitudinal medical image monitoring for individual patients. To address this challenge, we propose the first Medical Video Generation (MVG) framework that enables controlled manipulation of disease-related image and video features, allowing pre… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11943v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11943v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11943v1-abstract-full" style="display: none;"> Modeling disease progression is crucial for improving the quality and efficacy of clinical diagnosis and prognosis, but it is often hindered by a lack of longitudinal medical image monitoring for individual patients. To address this challenge, we propose the first Medical Video Generation (MVG) framework that enables controlled manipulation of disease-related image and video features, allowing precise, realistic, and personalized simulations of disease progression. Our approach begins by leveraging large language models (LLMs) to recaption prompts for the disease trajectory. Next, a controllable multi-round diffusion model simulates the disease progression state for each patient, creating realistic intermediate disease state sequences. Finally, a diffusion-based video transition generation model interpolates disease progression between these states. We validate our framework across three medical imaging domains: chest X-ray, fundus photography, and skin images. Our results demonstrate that MVG significantly outperforms baseline models in generating coherent and clinically plausible disease trajectories. Two user studies by veteran physicians provide further validation and insights into the clinical utility of the generated sequences. MVG has the potential to assist healthcare providers in modeling disease trajectories, interpolating missing medical image data, and enhancing medical education through realistic, dynamic visualizations of disease progression. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11943v1-abstract-full').style.display = 'none'; document.getElementById('2411.11943v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Tech Report. The appendix will be released soon.
arXiv admin note: text overlap with arXiv:2309.11745</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11903">arXiv:2411.11903</a> <span> [<a href="https://arxiv.org/pdf/2411.11903">pdf</a>, <a href="https://arxiv.org/format/2411.11903">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DiHuR: Diffusion-Guided Generalizable Human Reconstruction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jinnan Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen Li</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+G+H">Gim Hee Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11903v1-abstract-short" style="display: inline;"> We introduce DiHuR, a novel Diffusion-guided model for generalizable Human 3D Reconstruction and view synthesis from sparse, minimally overlapping images. While existing generalizable human radiance fields excel at novel view synthesis, they often struggle with comprehensive 3D reconstruction. Similarly, directly optimizing implicit Signed Distance Function (SDF) fields from sparse-view images typ… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11903v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11903v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11903v1-abstract-full" style="display: none;"> We introduce DiHuR, a novel Diffusion-guided model for generalizable Human 3D Reconstruction and view synthesis from sparse, minimally overlapping images. While existing generalizable human radiance fields excel at novel view synthesis, they often struggle with comprehensive 3D reconstruction. Similarly, directly optimizing implicit Signed Distance Function (SDF) fields from sparse-view images typically yields poor results due to limited overlap. To enhance 3D reconstruction quality, we propose using learnable tokens associated with SMPL vertices to aggregate sparse view features and then to guide SDF prediction. These tokens learn a generalizable prior across different identities in training datasets, leveraging the consistent projection of SMPL vertices onto similar semantic areas across various human identities. This consistency enables effective knowledge transfer to unseen identities during inference. Recognizing SMPL's limitations in capturing clothing details, we incorporate a diffusion model as an additional prior to fill in missing information, particularly for complex clothing geometries. Our method integrates two key priors in a coherent manner: the prior from generalizable feed-forward models and the 2D diffusion prior, and it requires only multi-view image training, without 3D supervision. DiHuR demonstrates superior performance in both within-dataset and cross-dataset generalization settings, as validated on THuman, ZJU-MoCap, and HuMMan datasets compared to existing methods. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11903v1-abstract-full').style.display = 'none'; document.getElementById('2411.11903v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to WACV 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11844">arXiv:2411.11844</a> <span> [<a href="https://arxiv.org/pdf/2411.11844">pdf</a>, <a href="https://arxiv.org/format/2411.11844">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Generative World Explorer </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lu%2C+T">Taiming Lu</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+T">Tianmin Shu</a>, <a href="/search/cs?searchtype=author&query=Yuille%2C+A">Alan Yuille</a>, <a href="/search/cs?searchtype=author&query=Khashabi%2C+D">Daniel Khashabi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jieneng Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11844v2-abstract-short" style="display: inline;"> Planning with partial observation is a central challenge in embodied AI. A majority of prior works have tackled this challenge by developing agents that physically explore their environment to update their beliefs about the world state. In contrast, humans can $\textit{imagine}$ unseen parts of the world through a mental exploration and $\textit{revise}$ their beliefs with imagined observations. S… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11844v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11844v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11844v2-abstract-full" style="display: none;"> Planning with partial observation is a central challenge in embodied AI. A majority of prior works have tackled this challenge by developing agents that physically explore their environment to update their beliefs about the world state. In contrast, humans can $\textit{imagine}$ unseen parts of the world through a mental exploration and $\textit{revise}$ their beliefs with imagined observations. Such updated beliefs can allow them to make more informed decisions, without necessitating the physical exploration of the world at all times. To achieve this human-like ability, we introduce the $\textit{Generative World Explorer (Genex)}$, an egocentric world exploration framework that allows an agent to mentally explore a large-scale 3D world (e.g., urban scenes) and acquire imagined observations to update its belief. 
This updated belief will then help the agent to make a more informed decision at the current step. To train $\textit{Genex}$, we create a synthetic urban scene dataset, Genex-DB. Our experimental results demonstrate that (1) $\textit{Genex}$ can generate high-quality and consistent observations during long-horizon exploration of a large virtual physical world and (2) the beliefs updated with the generated observations can inform an existing decision-making model (e.g., an LLM agent) to make better plans. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11844v2-abstract-full').style.display = 'none'; document.getElementById('2411.11844v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Website: generative-world-explorer.github.io</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11694">arXiv:2411.11694</a> <span> [<a href="https://arxiv.org/pdf/2411.11694">pdf</a>, <a href="https://arxiv.org/format/2411.11694">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Technical Report: Enhancing LLM Reasoning with Reward-guided Tree Search </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jinhao Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhipeng Chen</a>, <a href="/search/cs?searchtype=author&query=Min%2C+Y">Yingqian Min</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jie Chen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xiaoxue Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiapeng Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yiru Tang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Haoxiang Sun</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jia Deng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W+X">Wayne Xin Zhao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zheng Liu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+D">Dong Yan</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+J">Jian Xie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhongyuan Wang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+J">Ji-Rong Wen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11694v1-abstract-short" style="display: inline;"> Recently, test-time scaling has garnered significant attention from the research community, largely due to the substantial advancements of the o1 model released by OpenAI. 
By allocating more computational resources during the inference phase, large language models~(LLMs) can extensively explore the solution space by generating more thought tokens or diverse solutions, thereby producing more accura… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11694v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11694v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11694v1-abstract-full" style="display: none;"> Recently, test-time scaling has garnered significant attention from the research community, largely due to the substantial advancements of the o1 model released by OpenAI. By allocating more computational resources during the inference phase, large language models~(LLMs) can extensively explore the solution space by generating more thought tokens or diverse solutions, thereby producing more accurate responses. However, developing an o1-like reasoning approach is challenging, and researchers have been making various attempts to advance this open area of research. In this paper, we present a preliminary exploration into enhancing the reasoning abilities of LLMs through reward-guided tree search algorithms. This framework is implemented by integrating the policy model, reward model, and search algorithm. It is primarily constructed around a tree search algorithm, where the policy model navigates a dynamically expanding tree guided by a specially trained reward model. We thoroughly explore various design considerations necessary for implementing this framework and provide a detailed report of the technical aspects. To assess the effectiveness of our approach, we focus on mathematical reasoning tasks and conduct extensive evaluations on four challenging datasets, significantly enhancing the reasoning abilities of LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11694v1-abstract-full').style.display = 'none'; document.getElementById('2411.11694v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
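<p class="is-size-7">The framework above couples a policy model that expands a search tree with a trained reward model that guides the expansion. The sketch below is a generic best-first rendering of that interaction, assuming stand-in callables <code>policy_propose</code> (e.g., sampling continuations from an LLM policy) and <code>reward_score</code> (a learned scalar scorer); the paper's actual expansion, evaluation, and backup rules are more involved.</p> <pre><code>import heapq

def reward_guided_tree_search(policy_propose, reward_score, root,
                              max_expansions=50, branch=4):
    """Generic best-first search over reasoning states guided by a reward model
    (a simplified sketch, not the paper's algorithm)."""
    best, best_score = root, reward_score(root)
    frontier = [(-best_score, 0, root)]    # max-heap via negated scores
    counter = 1                            # tie-breaker so states are never compared
    for _ in range(max_expansions):
        if not frontier:
            break
        _, _, state = heapq.heappop(frontier)
        for child in policy_propose(state, branch):
            score = reward_score(child)
            if score > best_score:
                best, best_score = child, score
            heapq.heappush(frontier, (-score, counter, child))
            counter += 1
    return best
</code></pre>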
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">LLM;Complex Reasoning;Math</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11647">arXiv:2411.11647</a> <span> [<a href="https://arxiv.org/pdf/2411.11647">pdf</a>, <a href="https://arxiv.org/ps/2411.11647">ps</a>, <a href="https://arxiv.org/format/2411.11647">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> No-regret Exploration in Shuffle Private Reinforcement Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bai%2C+S">Shaojie Bai</a>, <a href="/search/cs?searchtype=author&query=Talebi%2C+M+S">Mohammad Sadegh Talebi</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+C">Chengcheng Zhao</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+P">Peng Cheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiming Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11647v1-abstract-short" style="display: inline;"> Differential privacy (DP) has recently been introduced into episodic reinforcement learning (RL) to formally address user privacy concerns in personalized services. Previous work mainly focuses on two trust models of DP: the central model, where a central agent is responsible for protecting users' sensitive data, and the (stronger) local model, where the protection occurs directly on the user side… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11647v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11647v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11647v1-abstract-full" style="display: none;"> Differential privacy (DP) has recently been introduced into episodic reinforcement learning (RL) to formally address user privacy concerns in personalized services. Previous work mainly focuses on two trust models of DP: the central model, where a central agent is responsible for protecting users' sensitive data, and the (stronger) local model, where the protection occurs directly on the user side. However, they either require a trusted central agent or incur a significantly higher privacy cost, making it unsuitable for many scenarios. This work introduces a trust model stronger than the central model but with a lower privacy cost than the local model, leveraging the emerging \emph{shuffle} model of privacy. We present the first generic algorithm for episodic RL under the shuffle model, where a trusted shuffler randomly permutes a batch of users' data before sending it to the central agent. We then instantiate the algorithm using our proposed shuffle Privatizer, relying on a shuffle private binary summation mechanism. 
Our analysis shows that the algorithm achieves a near-optimal regret bound comparable to that of the centralized model and significantly outperforms the local model in terms of privacy cost. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11647v1-abstract-full').style.display = 'none'; document.getElementById('2411.11647v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11581">arXiv:2411.11581</a> <span> [<a href="https://arxiv.org/pdf/2411.11581">pdf</a>, <a href="https://arxiv.org/format/2411.11581">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> OASIS: Open Agents Social Interaction Simulations on One Million Agents </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Ziyi Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zaibin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zirui Zheng</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuxian Jiang</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+Z">Ziyue Gan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zhiyu Wang</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zijian Ling</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jinsong Chen</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+M">Martz Ma</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+B">Bowen Dong</a>, <a href="/search/cs?searchtype=author&query=Gupta%2C+P">Prateek Gupta</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+S">Shuyue Hu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+Z">Zhenfei Yin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Guohao Li</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+X">Xu Jia</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lijun Wang</a>, <a href="/search/cs?searchtype=author&query=Ghanem%2C+B">Bernard Ghanem</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+H">Huchuan Lu</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+W">Wanli Ouyang</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+Y">Yu Qiao</a>, <a href="/search/cs?searchtype=author&query=Torr%2C+P">Philip Torr</a>, <a href="/search/cs?searchtype=author&query=Shao%2C+J">Jing Shao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11581v2-abstract-short" style="display: inline;"> There has been a growing interest in enhancing rule-based agent-based models (ABMs) for social media platforms (i.e., X, Reddit) with more realistic large language model (LLM) agents, thereby allowing for a more nuanced study of complex systems. As a result, several LLM-based ABMs have been proposed in the past year. 
While they hold promise, each simulator is specifically designed to study a parti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11581v2-abstract-full').style.display = 'inline'; document.getElementById('2411.11581v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11581v2-abstract-full" style="display: none;"> There has been a growing interest in enhancing rule-based agent-based models (ABMs) for social media platforms (i.e., X, Reddit) with more realistic large language model (LLM) agents, thereby allowing for a more nuanced study of complex systems. As a result, several LLM-based ABMs have been proposed in the past year. While they hold promise, each simulator is specifically designed to study a particular scenario, making it time-consuming and resource-intensive to explore other phenomena using the same ABM. Additionally, these models simulate only a limited number of agents, whereas real-world social media platforms involve millions of users. To this end, we propose OASIS, a generalizable and scalable social media simulator. OASIS is designed based on real-world social media platforms, incorporating dynamically updated environments (i.e., dynamic social networks and post information), diverse action spaces (i.e., following, commenting), and recommendation systems (i.e., interest-based and hot-score-based). Additionally, OASIS supports large-scale user simulations, capable of modeling up to one million users. With these features, OASIS can be easily extended to different social media platforms to study large-scale group phenomena and behaviors. We replicate various social phenomena, including information spreading, group polarization, and herd effects across X and Reddit platforms. Moreover, we provide observations of social phenomena at different agent group scales. We observe that the larger agent group scale leads to more enhanced group dynamics and more diverse and helpful agents' opinions. These findings demonstrate OASIS's potential as a powerful tool for studying complex systems in digital environments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11581v2-abstract-full').style.display = 'none'; document.getElementById('2411.11581v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
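To make the simulation loop described in the OASIS abstract above more concrete, here is a toy, self-contained Python sketch of agents acting on a feed ranked by a hot score; the `Post`, `agent_act`, and `hot_score` names and the ranking formula are illustrative assumptions, not the OASIS codebase or API.

```python
# Toy agent-based social-media loop: each round, every agent sees a hot-score-ranked
# feed and either likes a post or creates a new one. Purely illustrative.
import math
import random

class Post:
    def __init__(self, author, text, t):
        self.author, self.text, self.t = author, text, t
        self.likes = 0

def hot_score(post, now):
    # Reddit-style "hot" ranking: engagement discounted by age (seconds -> hours).
    return math.log1p(post.likes) - (now - post.t) / 3600.0

def agent_act(agent_id, feed):
    # Placeholder for an LLM call deciding what the agent does with its feed.
    if feed and random.random() < 0.5:
        return ("like", random.choice(feed))
    return ("post", f"message from agent {agent_id}")

def simulate(num_agents=100, steps=10):
    posts, now = [], 0.0
    for _ in range(steps):
        now += 600.0                                  # 10 simulated minutes per round
        for agent_id in range(num_agents):
            feed = sorted(posts, key=lambda p: hot_score(p, now), reverse=True)[:5]
            action, payload = agent_act(agent_id, feed)
            if action == "like":
                payload.likes += 1
            else:
                posts.append(Post(agent_id, payload, now))
    return posts

posts = simulate()
print(len(posts), "posts created,", sum(p.likes for p in posts), "likes in total")
```

A realistic setup would replace `agent_act` with an LLM call and track follows, reposts, and the evolving social graph, as the abstract describes.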
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11448">arXiv:2411.11448</a> <span> [<a href="https://arxiv.org/pdf/2411.11448">pdf</a>, <a href="https://arxiv.org/format/2411.11448">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Unveiling the Inflexibility of Adaptive Embedding in Traffic Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongjun Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+L">Lingyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+R">Renhe Jiang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+X">Xuan Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11448v1-abstract-short" style="display: inline;"> Spatiotemporal Graph Neural Networks (ST-GNNs) and Transformers have shown significant promise in traffic forecasting by effectively modeling temporal and spatial correlations. However, rapid urbanization in recent years has led to dynamic shifts in traffic patterns and travel demand, posing major challenges for accurate long-term traffic prediction. The generalization capability of ST-GNNs in ext… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11448v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11448v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11448v1-abstract-full" style="display: none;"> Spatiotemporal Graph Neural Networks (ST-GNNs) and Transformers have shown significant promise in traffic forecasting by effectively modeling temporal and spatial correlations. However, rapid urbanization in recent years has led to dynamic shifts in traffic patterns and travel demand, posing major challenges for accurate long-term traffic prediction. The generalization capability of ST-GNNs in extended temporal scenarios and cross-city applications remains largely unexplored. In this study, we evaluate state-of-the-art models on an extended traffic benchmark and observe substantial performance degradation in existing ST-GNNs over time, which we attribute to their limited inductive capabilities. Our analysis reveals that this degradation stems from an inability to adapt to evolving spatial relationships within urban environments. To address this limitation, we reconsider the design of adaptive embeddings and propose a Principal Component Analysis (PCA) embedding approach that enables models to adapt to new scenarios without retraining. We incorporate PCA embeddings into existing ST-GNN and Transformer architectures, achieving marked improvements in performance. Notably, PCA embeddings allow for flexibility in graph structures between training and testing, enabling models trained on one city to perform zero-shot predictions on other cities. 
This adaptability demonstrates the potential of PCA embeddings in enhancing the robustness and generalization of spatiotemporal models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11448v1-abstract-full').style.display = 'none'; document.getElementById('2411.11448v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11361">arXiv:2411.11361</a> <span> [<a href="https://arxiv.org/pdf/2411.11361">pdf</a>, <a href="https://arxiv.org/format/2411.11361">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Scalable Autoregressive Monocular Depth Estimation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jinhong Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jian Liu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+D">Dongqi Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weiqiang Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wentong Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+D">Danny Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J+i">Jintai Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jian Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11361v1-abstract-short" style="display: inline;"> This paper proposes a new autoregressive model as an effective and scalable monocular depth estimator. Our idea is simple: We tackle the monocular depth estimation (MDE) task with an autoregressive prediction paradigm, based on two core designs. First, our depth autoregressive model (DAR) treats the depth map of different resolutions as a set of tokens, and conducts the low-to-high resolution auto… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11361v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11361v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11361v1-abstract-full" style="display: none;"> This paper proposes a new autoregressive model as an effective and scalable monocular depth estimator. Our idea is simple: We tackle the monocular depth estimation (MDE) task with an autoregressive prediction paradigm, based on two core designs. First, our depth autoregressive model (DAR) treats the depth map of different resolutions as a set of tokens, and conducts the low-to-high resolution autoregressive objective with a patch-wise causal mask. Second, our DAR recursively discretizes the entire depth range into more compact intervals, and attains the coarse-to-fine granularity autoregressive objective in an ordinal-regression manner. By coupling these two autoregressive objectives, our DAR establishes new state-of-the-art (SOTA) on KITTI and NYU Depth v2 by clear margins.
Further, our scalable approach allows us to scale the model up to 2.0B and achieve the best RMSE of 1.799 on the KITTI dataset (5% improvement) compared to 1.896 by the current SOTA (Depth Anything). DAR further showcases zero-shot generalization ability on unseen datasets. These results suggest that DAR yields superior performance with an autoregressive prediction paradigm, providing a promising approach to equip modern autoregressive large models (e.g., GPT-4o) with depth estimation capabilities. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11361v1-abstract-full').style.display = 'none'; document.getElementById('2411.11361v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11296">arXiv:2411.11296</a> <span> [<a href="https://arxiv.org/pdf/2411.11296">pdf</a>, <a href="https://arxiv.org/format/2411.11296">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Steering Language Model Refusal with Sparse Autoencoders </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=O%27Brien%2C+K">Kyle O'Brien</a>, <a href="/search/cs?searchtype=author&query=Majercak%2C+D">David Majercak</a>, <a href="/search/cs?searchtype=author&query=Fernandes%2C+X">Xavier Fernandes</a>, <a href="/search/cs?searchtype=author&query=Edgar%2C+R">Richard Edgar</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingya Chen</a>, <a href="/search/cs?searchtype=author&query=Nori%2C+H">Harsha Nori</a>, <a href="/search/cs?searchtype=author&query=Carignan%2C+D">Dean Carignan</a>, <a href="/search/cs?searchtype=author&query=Horvitz%2C+E">Eric Horvitz</a>, <a href="/search/cs?searchtype=author&query=Poursabzi-Sangde%2C+F">Forough Poursabzi-Sangde</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11296v1-abstract-short" style="display: inline;"> Responsible practices for deploying language models include guiding models to recognize and refuse answering prompts that are considered unsafe, while complying with safe prompts. Achieving such behavior typically requires updating model weights, which is costly and inflexible. We explore opportunities to steering model activations at inference time, which does not require updating weights. Using… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11296v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11296v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11296v1-abstract-full" style="display: none;"> Responsible practices for deploying language models include guiding models to recognize and refuse answering prompts that are considered unsafe, while complying with safe prompts. Achieving such behavior typically requires updating model weights, which is costly and inflexible. 
We explore opportunities to steer model activations at inference time, which does not require updating weights. Using… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11296v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11296v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11296v1-abstract-full" style="display: none;"> Responsible practices for deploying language models include guiding models to recognize and refuse answering prompts that are considered unsafe, while complying with safe prompts. Achieving such behavior typically requires updating model weights, which is costly and inflexible. We explore opportunities to steer model activations at inference time, which does not require updating weights. Using sparse autoencoders, we identify and steer features in Phi-3 Mini that mediate refusal behavior. We find that feature steering can improve Phi-3 Mini's robustness to jailbreak attempts across various harms, including challenging multi-turn attacks. However, we discover that feature steering can adversely affect overall performance on benchmarks. These results suggest that identifying steerable mechanisms for refusal via sparse autoencoders is a promising approach for enhancing language model safety, but that more research is needed to mitigate feature steering's adverse effects on performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11296v1-abstract-full').style.display = 'none'; document.getElementById('2411.11296v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11295">arXiv:2411.11295</a> <span> [<a href="https://arxiv.org/pdf/2411.11295">pdf</a>, <a href="https://arxiv.org/format/2411.11295">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Transcending Language Boundaries: Harnessing LLMs for Low-Resource Language Translation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shu%2C+P">Peng Shu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junhao Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhengliang Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hui Wang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zihao Wu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+T">Tianyang Zhong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yiwei Li</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Huaqin Zhao</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hanqi Jiang</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Y">Yi Pan</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yifan Zhou</a>, <a href="/search/cs?searchtype=author&query=Owl%2C+C">Constance Owl</a>, <a href="/search/cs?searchtype=author&query=Zhai%2C+X">Xiaoming Zhai</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+N">Ninghao Liu</a>, <a href="/search/cs?searchtype=author&query=Saunt%2C+C">Claudio Saunt</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianming Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11295v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have demonstrated remarkable success across a wide range of tasks and domains. However, their performance in low-resource language translation, particularly when translating into these languages, remains underexplored.
This gap poses significant challenges, as linguistic barriers hinder the cultural preservation and development of minority communities. To address this… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11295v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11295v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11295v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have demonstrated remarkable success across a wide range of tasks and domains. However, their performance in low-resource language translation, particularly when translating into these languages, remains underexplored. This gap poses significant challenges, as linguistic barriers hinder the cultural preservation and development of minority communities. To address this issue, this paper introduces a novel retrieval-based method that enhances translation quality for low-resource languages by focusing on key terms, which involves translating keywords and retrieving corresponding examples from existing data. To evaluate the effectiveness of this method, we conducted experiments translating from English into three low-resource languages: Cherokee, a critically endangered indigenous language of North America; Tibetan, a historically and culturally significant language in Asia; and Manchu, a language with few remaining speakers. Our comparison with the zero-shot performance of GPT-4o and LLaMA 3.1 405B, highlights the significant challenges these models face when translating into low-resource languages. In contrast, our retrieval-based method shows promise in improving both word-level accuracy and overall semantic understanding by leveraging existing resources more effectively. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11295v1-abstract-full').style.display = 'none'; document.getElementById('2411.11295v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10998">arXiv:2411.10998</a> <span> [<a href="https://arxiv.org/pdf/2411.10998">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> Image-Based RKPM for Accessing Failure Mechanisms in Composite Materials </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanran Wang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yichun Tang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+J">Jing Du</a>, <a href="/search/cs?searchtype=author&query=Hillman%2C+M">Mike Hillman</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J+S">J. S. 
Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10998v1-abstract-short" style="display: inline;"> Stress distributions and the corresponding fracture patterns and evolutions in the microstructures strongly influence the load-carrying capabilities of composite structures. This work introduces an enhanced phase-field fracture model incorporating interface decohesion to simulate fracture propagation and interactions at material interfaces and within the constituents of composite microstructures.… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10998v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10998v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10998v1-abstract-full" style="display: none;"> Stress distributions and the corresponding fracture patterns and evolutions in the microstructures strongly influence the load-carrying capabilities of composite structures. This work introduces an enhanced phase-field fracture model incorporating interface decohesion to simulate fracture propagation and interactions at material interfaces and within the constituents of composite microstructures. The proposed method employs an interface-modified reproducing kernel (IM-RK) approximation for handling cross-interface discontinuities constructed from image voxels and guided by Support Vector Machine (SVM) material classification. The numerical models are directly generated from X-ray microtomography image voxels, guided by SVM using voxel color code information. Additionally, a strain energy-based phase field variable is introduced, eliminating the need to solve coupled field problems. The effectiveness of this method is demonstrated in modeling crack growth both along interfaces and across matrix and inclusion domains and in predicting the corresponding structural-scale mechanical behavior in composite structures. Furthermore, the proposed method has been validated against experimentally observed crack patterns. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10998v1-abstract-full').style.display = 'none'; document.getElementById('2411.10998v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">16 pages, 9 figures.
arXiv admin note: text overlap with arXiv:2305.16402</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10958">arXiv:2411.10958</a> <span> [<a href="https://arxiv.org/pdf/2411.10958">pdf</a>, <a href="https://arxiv.org/format/2411.10958">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Neural and Evolutionary Computing">cs.NE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Performance">cs.PF</span> </div> </div> <p class="title is-5 mathjax"> SageAttention2 Technical Report: Accurate 4 Bit Attention for Plug-and-play Inference Acceleration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jintao Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Haofeng Huang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Pengle Zhang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+J">Jia Wei</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+J">Jun Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jianfei Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10958v1-abstract-short" style="display: inline;"> Although quantization for linear layers has been widely used, its application to accelerate the attention process remains limited. SageAttention utilizes 8-bit matrix multiplication, 16-bit matrix multiplication with 16-bit accumulator, and precision-enhancing methods, implementing an accurate kernel with a 2x speedup compared to FlashAttention2. To further enhance the efficiency of attention computa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10958v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10958v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10958v1-abstract-full" style="display: none;"> Although quantization for linear layers has been widely used, its application to accelerate the attention process remains limited. SageAttention utilizes 8-bit matrix multiplication, 16-bit matrix multiplication with 16-bit accumulator, and precision-enhancing methods, implementing an accurate kernel with a 2x speedup compared to FlashAttention2. To further enhance the efficiency of attention computation while maintaining precision, we propose SageAttention2, which utilizes significantly faster 4-bit matrix multiplication (Matmul) alongside additional precision-enhancing techniques. First, we propose to quantize matrices $(Q, K)$ to INT4 in a warp-level granularity and quantize matrices $(\widetilde P, V)$ to FP8. Second, we propose a method to smooth $Q$ and $V$, enhancing the accuracy of attention with INT4 $QK$ and FP8 $PV$.
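As a rough illustration of the smoothing-plus-low-bit-quantization idea just described, the NumPy sketch below emulates symmetric per-row quantization of $Q$ and $K$, with $Q$ smoothed by removing a per-channel mean that is added back as an exact correction term. It is an assumption-laden toy (INT4 emulated with integer levels in [-7, 7]), not the paper's CUDA kernels.

```python
# Toy demonstration: mean-smoothing plus symmetric low-bit quantization of Q and K
# before the QK^T matmul, with the removed mean restored exactly afterwards.
import numpy as np

def smooth(x):
    # Remove the per-channel mean; it is added back exactly via a correction term below.
    mean = x.mean(axis=0, keepdims=True)
    return x - mean, mean

def quantize_sym(x, levels=7):
    # Symmetric per-row quantization: scale each row so its max |value| maps to `levels`.
    scale = np.abs(x).max(axis=-1, keepdims=True) / levels + 1e-12
    q = np.clip(np.round(x / scale), -levels, levels).astype(np.int8)
    return q, scale

rng = np.random.default_rng(0)
Q = rng.normal(size=(128, 64)) + 3.0      # a large shared offset hurts naive low-bit quantization
K = rng.normal(size=(128, 64))

Qs, q_mean = smooth(Q)
q_q, q_scale = quantize_sym(Qs)
k_q, k_scale = quantize_sym(K)

# Dequantized low-bit QK^T plus the exact correction for the removed mean.
approx = (q_q * q_scale) @ (k_q * k_scale).T + q_mean @ K.T
exact = Q @ K.T
print("relative error:", np.linalg.norm(approx - exact) / np.linalg.norm(exact))
```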
Third, we analyze the quantization accuracy across timesteps and layers, then propose an adaptive quantization method to ensure the end-to-end metrics over various models. The operations per second (OPS) of SageAttention2 surpass FlashAttention2 and xformers by about 3x and 5x on RTX4090, respectively. Comprehensive experiments confirm that our approach incurs negligible end-to-end metrics loss across diverse models, including those for large language processing, image generation, and video generation. The codes are available at https://github.com/thu-ml/SageAttention. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10958v1-abstract-full').style.display = 'none'; document.getElementById('2411.10958v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10739">arXiv:2411.10739</a> <span> [<a href="https://arxiv.org/pdf/2411.10739">pdf</a>, <a href="https://arxiv.org/format/2411.10739">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Signal Processing">eess.SP</span> </div> </div> <p class="title is-5 mathjax"> A Wearable Gait Monitoring System for 17 Gait Parameters Based on Computer Vision </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiangang Chen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+Y">Yung-Hong Sun</a>, <a href="/search/cs?searchtype=author&query=Pickett%2C+K">Kristen Pickett</a>, <a href="/search/cs?searchtype=author&query=King%2C+B">Barbara King</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Y+H">Yu Hen Hu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+H">Hongrui Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10739v1-abstract-short" style="display: inline;"> We developed a shoe-mounted gait monitoring system capable of tracking up to 17 gait parameters, including gait length, step time, stride velocity, and others. The system employs a stereo camera mounted on one shoe to track a marker placed on the opposite shoe, enabling the estimation of spatial gait parameters. Additionally, a Force Sensitive Resistor (FSR) affixed to the heel of the shoe, combin… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10739v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10739v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10739v1-abstract-full" style="display: none;"> We developed a shoe-mounted gait monitoring system capable of tracking up to 17 gait parameters, including gait length, step time, stride velocity, and others. 
The system employs a stereo camera mounted on one shoe to track a marker placed on the opposite shoe, enabling the estimation of spatial gait parameters. Additionally, a Force Sensitive Resistor (FSR) affixed to the heel of the shoe, combined with a custom-designed algorithm, is utilized to measure temporal gait parameters. Through testing on multiple participants and comparison with the gait mat, the proposed gait monitoring system exhibited notable performance, with the accuracy of all measured gait parameters exceeding 93.61%. The system also demonstrated a low drift of 4.89% during long-distance walking. A gait identification task conducted on participants using a trained Transformer model achieved 95.7% accuracy on the dataset collected by the proposed system, demonstrating that our hardware has the potential to collect long-sequence gait data suitable for integration with current Large Language Models (LLMs). The system is cost-effective, user-friendly, and well-suited for real-life measurements. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10739v1-abstract-full').style.display = 'none'; document.getElementById('2411.10739v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 14 figures. This paper was submitted for publication to the IEEE Transactions on Instrumentation and Measurement</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10187">arXiv:2411.10187</a> <span> [<a href="https://arxiv.org/pdf/2411.10187">pdf</a>, <a href="https://arxiv.org/format/2411.10187">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Try-On-Adapter: A Simple and Flexible Try-On Paradigm </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+H">Hanzhong Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jianfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+C">Cheng Zou</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jun Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Meng Wang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+R">Ruxue Wen</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+P">Pingzhong Tang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingdong Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Ming Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10187v1-abstract-short" style="display: inline;"> Image-based virtual try-on, widely used in online shopping, aims to generate images of a naturally dressed person conditioned on certain garments, providing significant research and commercial potential. 
A key challenge of try-on is to generate realistic images of the model wearing the garments while preserving the details of the garments. Previous methods focus on masking certain parts of the ori… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10187v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10187v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10187v1-abstract-full" style="display: none;"> Image-based virtual try-on, widely used in online shopping, aims to generate images of a naturally dressed person conditioned on certain garments, providing significant research and commercial potential. A key challenge of try-on is to generate realistic images of the model wearing the garments while preserving the details of the garments. Previous methods focus on masking certain parts of the original model's standing image, and then inpainting on masked areas to generate realistic images of the model wearing corresponding reference garments, which treat the try-on task as an inpainting task. However, such implements require the user to provide a complete, high-quality standing image, which is user-unfriendly in practical applications. In this paper, we propose Try-On-Adapter (TOA), an outpainting paradigm that differs from the existing inpainting paradigm. Our TOA can preserve the given face and garment, naturally imagine the rest parts of the image, and provide flexible control ability with various conditions, e.g., garment properties and human pose. In the experiments, TOA shows excellent performance on the virtual try-on task even given relatively low-quality face and garment images in qualitative comparisons. Additionally, TOA achieves the state-of-the-art performance of FID scores 5.56 and 7.23 for paired and unpaired on the VITON-HD dataset in quantitative comparisons. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10187v1-abstract-full').style.display = 'none'; document.getElementById('2411.10187v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
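The Try-On-Adapter abstract above reports paired and unpaired FID on VITON-HD. For readers unfamiliar with the metric, FID of this kind can be computed with torchmetrics roughly as in the sketch below (assuming `torchmetrics[image]` is installed); the random tensors are placeholders for real ground-truth and generated try-on images, which are not part of this listing.

```python
# Hedged sketch of a FID computation between real and generated images.
import torch
from torchmetrics.image.fid import FrechetInceptionDistance

fid = FrechetInceptionDistance(feature=2048)   # expects uint8 images in NCHW layout

real_imgs = torch.randint(0, 256, (16, 3, 256, 192), dtype=torch.uint8)   # placeholder data
fake_imgs = torch.randint(0, 256, (16, 3, 256, 192), dtype=torch.uint8)   # placeholder data

fid.update(real_imgs, real=True)     # ground-truth person images
fid.update(fake_imgs, real=False)    # generated try-on results
print("FID:", fid.compute().item())
```

With only a handful of placeholder images the score is meaningless; a real evaluation would stream the full paired or unpaired test split through `update`.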
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Image virtual try-on, 7 pages, 3 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10169">arXiv:2411.10169</a> <span> [<a href="https://arxiv.org/pdf/2411.10169">pdf</a>, <a href="https://arxiv.org/format/2411.10169">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> Definition and Detection of Centralization Defects in Smart Contracts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zewei Lin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiachi Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+J">Jiajing Wu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+W">Weizhe Zhang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zibin Zheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10169v1-abstract-short" style="display: inline;"> In recent years, security incidents stemming from centralization defects in smart contracts have led to substantial financial losses. A centralization defect refers to any error, flaw, or fault in a smart contract's design or development stage that introduces a single point of failure. Such defects allow a specific account or user to disrupt the normal operations of smart contracts, potentially ca… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10169v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10169v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10169v1-abstract-full" style="display: none;"> In recent years, security incidents stemming from centralization defects in smart contracts have led to substantial financial losses. A centralization defect refers to any error, flaw, or fault in a smart contract's design or development stage that introduces a single point of failure. Such defects allow a specific account or user to disrupt the normal operations of smart contracts, potentially causing malfunctions or even complete project shutdowns. Despite the significance of this issue, most current smart contract analyses overlook centralization defects, focusing primarily on other types of defects. To address this gap, our paper introduces six types of centralization defects in smart contracts by manually analyzing 597 Stack Exchange posts and 117 audit reports. For each defect, we provide a detailed description and code examples to illustrate its characteristics and potential impacts. Additionally, we introduce a tool named CDRipper (Centralization Defects Ripper) designed to identify the defined centralization defects. Specifically, CDRipper constructs a permission dependency graph (PDG) and extracts the permission dependencies of functions from the source code of smart contracts. It then detects the sensitive operations in functions and identifies centralization defects based on predefined patterns. 
We conduct a large-scale experiment using CDRipper on 244,424 real-world smart contracts and evaluate the results based on a manually labeled dataset. Our findings reveal that 82,446 contracts contain at least one of the six centralization defects, with our tool achieving an overall precision of 93.7%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10169v1-abstract-full').style.display = 'none'; document.getElementById('2411.10169v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10137">arXiv:2411.10137</a> <span> [<a href="https://arxiv.org/pdf/2411.10137">pdf</a>, <a href="https://arxiv.org/format/2411.10137">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Legal Evalutions and Challenges of Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiaqi Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Huan Zhao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Z">Zhenyuan Yang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+P">Peng Shu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Junhao Chen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+H">Haobo Sun</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+R">Ruixi Liang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shixin Li</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+P">Pengcheng Shi</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Longjun Ma</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zongjia Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Z">Zhengliang Liu</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+T">Tianyang Zhong</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yutong Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+C">Chong Ma</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xin Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tuo Zhang</a>, <a href="/search/cs?searchtype=author&query=Ding%2C+T">Tianli Ding</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+Y">Yudan Ren</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+T">Tianming Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xi Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10137v1-abstract-short" style="display: inline;"> In this paper, we review legal testing methods based on Large Language Models (LLMs), using the OPENAI o1 model as a case study to evaluate the performance of large models in applying legal provisions. 
We compare current state-of-the-art LLMs, including open-source, closed-source, and legal-specific models trained specifically for the legal domain. Systematic tests are conducted on English and Chi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10137v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10137v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10137v1-abstract-full" style="display: none;"> In this paper, we review legal testing methods based on Large Language Models (LLMs), using the OPENAI o1 model as a case study to evaluate the performance of large models in applying legal provisions. We compare current state-of-the-art LLMs, including open-source, closed-source, and legal-specific models trained specifically for the legal domain. Systematic tests are conducted on English and Chinese legal cases, and the results are analyzed in depth. Through systematic testing of legal cases from common law systems and China, this paper explores the strengths and weaknesses of LLMs in understanding and applying legal texts, reasoning through legal issues, and predicting judgments. The experimental results highlight both the potential and limitations of LLMs in legal applications, particularly in terms of challenges related to the interpretation of legal language and the accuracy of legal reasoning. Finally, the paper provides a comprehensive analysis of the advantages and disadvantages of various types of models, offering valuable insights and references for the future application of AI in the legal field. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10137v1-abstract-full').style.display = 'none'; document.getElementById('2411.10137v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
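The legal-evaluation abstract above describes systematic tests of several LLMs on English and Chinese cases. The snippet below is a deliberately generic Python harness sketching that kind of comparison; the `cases` list, `dummy_model`, and the exact-match scoring rule are placeholders, not the paper's benchmark or prompts.

```python
# Minimal evaluation-harness sketch: run each model over shared legal cases and
# score whether the predicted outcome matches the reference judgment.
cases = [
    {"id": "en-001", "facts": "Tenant withheld rent after the landlord failed to repair heating.", "label": "tenant liable"},
    {"id": "zh-001", "facts": "The buyer refused delivery of conforming goods without notice.", "label": "buyer liable"},
]

def dummy_model(prompt: str) -> str:
    # Placeholder for an API call to an LLM (e.g., an o1-style or open-source model).
    return "tenant liable"

model_fns = {"dummy-model": dummy_model}

def evaluate(model_fns, cases):
    results = {}
    for name, fn in model_fns.items():
        correct = 0
        for case in cases:
            prompt = (
                "Apply the relevant legal provisions to the following facts and "
                "state which party is liable.\nFacts: " + case["facts"]
            )
            prediction = fn(prompt).strip().lower()
            correct += int(prediction == case["label"])
        results[name] = correct / len(cases)
    return results

print(evaluate(model_fns, cases))
```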
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09952">arXiv:2411.09952</a> <span> [<a href="https://arxiv.org/pdf/2411.09952">pdf</a>, <a href="https://arxiv.org/format/2411.09952">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multimedia">cs.MM</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3696409.3700241">10.1145/3696409.3700241 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> GGAvatar: Reconstructing Garment-Separated 3D Gaussian Splatting Avatars from Monocular Video </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingxuan Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09952v1-abstract-short" style="display: inline;"> Avatar modelling has broad applications in human animation and virtual try-ons. Recent advancements in this field have focused on high-quality and comprehensive human reconstruction but often overlook the separation of clothing from the body. To bridge this gap, this paper introduces GGAvatar (Garment-separated 3D Gaussian Splatting Avatar), which relies on monocular videos. Through advanced param… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09952v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09952v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09952v1-abstract-full" style="display: none;"> Avatar modelling has broad applications in human animation and virtual try-ons. Recent advancements in this field have focused on high-quality and comprehensive human reconstruction but often overlook the separation of clothing from the body. To bridge this gap, this paper introduces GGAvatar (Garment-separated 3D Gaussian Splatting Avatar), which relies on monocular videos. Through advanced parameterized templates and unique phased training, this model effectively achieves decoupled, editable, and realistic reconstruction of clothed humans. Comparative evaluations with other costly models confirm GGAvatar's superior quality and efficiency in modelling both clothed humans and separable garments. The paper also showcases applications in clothing editing, as illustrated in Figure 1, highlighting the model's benefits and the advantages of effective disentanglement. The code is available at https://github.com/J-X-Chen/GGAvatar/. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09952v1-abstract-full').style.display = 'none'; document.getElementById('2411.09952v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">MMAsia'24 Accepted</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09807">arXiv:2411.09807</a> <span> [<a href="https://arxiv.org/pdf/2411.09807">pdf</a>, <a href="https://arxiv.org/format/2411.09807">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Evaluating Loss Landscapes from a Topology Perspective </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xie%2C+T">Tiankai Xie</a>, <a href="/search/cs?searchtype=author&query=Geniesse%2C+C">Caleb Geniesse</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiaqing Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yaoqing Yang</a>, <a href="/search/cs?searchtype=author&query=Morozov%2C+D">Dmitriy Morozov</a>, <a href="/search/cs?searchtype=author&query=Mahoney%2C+M+W">Michael W. Mahoney</a>, <a href="/search/cs?searchtype=author&query=Maciejewski%2C+R">Ross Maciejewski</a>, <a href="/search/cs?searchtype=author&query=Weber%2C+G+H">Gunther H. Weber</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09807v1-abstract-short" style="display: inline;"> Characterizing the loss of a neural network with respect to model parameters, i.e., the loss landscape, can provide valuable insights into properties of that model. Various methods for visualizing loss landscapes have been proposed, but less emphasis has been placed on quantifying and extracting actionable and reproducible insights from these complex representations. Inspired by powerful tools fro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09807v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09807v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09807v1-abstract-full" style="display: none;"> Characterizing the loss of a neural network with respect to model parameters, i.e., the loss landscape, can provide valuable insights into properties of that model. Various methods for visualizing loss landscapes have been proposed, but less emphasis has been placed on quantifying and extracting actionable and reproducible insights from these complex representations. 
Inspired by powerful tools from topological data analysis (TDA) for summarizing the structure of high-dimensional data, here we characterize the underlying shape (or topology) of loss landscapes, quantifying the topology to reveal new insights about neural networks. To relate our findings to the machine learning (ML) literature, we compute simple performance metrics (e.g., accuracy, error), and we characterize the local structure of loss landscapes using Hessian-based metrics (e.g., largest eigenvalue, trace, eigenvalue spectral density). Following this approach, we study established models from image pattern recognition (e.g., ResNets) and scientific ML (e.g., physics-informed neural networks), and we show how quantifying the shape of loss landscapes can provide new insights into model performance and learning dynamics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09807v1-abstract-full').style.display = 'none'; document.getElementById('2411.09807v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09618">arXiv:2411.09618</a> <span> [<a href="https://arxiv.org/pdf/2411.09618">pdf</a>, <a href="https://arxiv.org/format/2411.09618">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.59275/j.melba.2024-9c68">10.59275/j.melba.2024-9c68 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> MICCAI-CDMRI 2023 QuantConn Challenge Findings on Achieving Robust Quantitative Connectivity through Harmonized Preprocessing of Diffusion MRI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Newlin%2C+N+R">Nancy R. Newlin</a>, <a href="/search/cs?searchtype=author&query=Schilling%2C+K">Kurt Schilling</a>, <a href="/search/cs?searchtype=author&query=Koudoro%2C+S">Serge Koudoro</a>, <a href="/search/cs?searchtype=author&query=Chandio%2C+B+Q">Bramsh Qamar Chandio</a>, <a href="/search/cs?searchtype=author&query=Kanakaraj%2C+P">Praitayini Kanakaraj</a>, <a href="/search/cs?searchtype=author&query=Moyer%2C+D">Daniel Moyer</a>, <a href="/search/cs?searchtype=author&query=Kelly%2C+C+E">Claire E. 
Kelly</a>, <a href="/search/cs?searchtype=author&query=Genc%2C+S">Sila Genc</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jian Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J+Y">Joseph Yuan-Mou Yang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Ye Wu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yifei He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jiawei Zhang</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Q">Qingrun Zeng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fan Zhang</a>, <a href="/search/cs?searchtype=author&query=Adluru%2C+N">Nagesh Adluru</a>, <a href="/search/cs?searchtype=author&query=Nath%2C+V">Vishwesh Nath</a>, <a href="/search/cs?searchtype=author&query=Pathak%2C+S">Sudhir Pathak</a>, <a href="/search/cs?searchtype=author&query=Schneider%2C+W">Walter Schneider</a>, <a href="/search/cs?searchtype=author&query=Gade%2C+A">Anurag Gade</a>, <a href="/search/cs?searchtype=author&query=Rathi%2C+Y">Yogesh Rathi</a>, <a href="/search/cs?searchtype=author&query=Hendriks%2C+T">Tom Hendriks</a>, <a href="/search/cs?searchtype=author&query=Vilanova%2C+A">Anna Vilanova</a>, <a href="/search/cs?searchtype=author&query=Chamberland%2C+M">Maxime Chamberland</a>, <a href="/search/cs?searchtype=author&query=Pieciak%2C+T">Tomasz Pieciak</a> , et al. (11 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09618v1-abstract-short" style="display: inline;"> White matter alterations are increasingly implicated in neurological diseases and their progression. International-scale studies use diffusion-weighted magnetic resonance imaging (DW-MRI) to qualitatively identify changes in white matter microstructure and connectivity. Yet, quantitative analysis of DW-MRI data is hindered by inconsistencies stemming from varying acquisition protocols. There is a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09618v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09618v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09618v1-abstract-full" style="display: none;"> White matter alterations are increasingly implicated in neurological diseases and their progression. International-scale studies use diffusion-weighted magnetic resonance imaging (DW-MRI) to qualitatively identify changes in white matter microstructure and connectivity. Yet, quantitative analysis of DW-MRI data is hindered by inconsistencies stemming from varying acquisition protocols. There is a pressing need to harmonize the preprocessing of DW-MRI datasets to ensure the derivation of robust quantitative diffusion metrics across acquisitions. In the MICCAI-CDMRI 2023 QuantConn challenge, participants were provided raw data from the same individuals collected on the same scanner but with two different acquisitions and tasked with preprocessing the DW-MRI to minimize acquisition differences while retaining biological variation. Submissions are evaluated on the reproducibility and comparability of cross-acquisition bundle-wise microstructure measures, bundle shape features, and connectomics. 
The key innovations of the QuantConn challenge are that (1) we assess bundles and tractography in the context of harmonization for the first time, (2) we assess connectomics in the context of harmonization for the first time, and (3) we have 10x more subjects than the prior harmonization challenge, MUSHAC, and 100x more than SuperMUDI. We find that bundle surface area, fractional anisotropy, connectome assortativity, betweenness centrality, edge count, modularity, nodal strength, and participation coefficient measures are most biased by acquisition and that machine learning voxel-wise correction, RISH mapping, and NeSH methods effectively reduce these biases. In addition, microstructure measures AD, MD, RD, bundle length, connectome density, efficiency, and path length are least biased by these acquisition differences. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09618v1-abstract-full').style.display = 'none'; document.getElementById('2411.09618v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted for publication at the Journal of Machine Learning for Biomedical Imaging (MELBA) https://melba-journal.org/2024/019</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> Machine.Learning.for.Biomedical.Imaging. 2 (2024) </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09540">arXiv:2411.09540</a> <span> [<a href="https://arxiv.org/pdf/2411.09540">pdf</a>, <a href="https://arxiv.org/format/2411.09540">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Prompting the Unseen: Detecting Hidden Backdoors in Black-Box Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zi-Xuan Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jia-Wei Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhi-Peng Zhang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+C">Chia-Mu Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09540v1-abstract-short" style="display: inline;"> Visual prompting (VP) is a new technique that adapts well-trained frozen models for source domain tasks to target domain tasks. This study examines VP's benefits for black-box model-level backdoor detection. The visual prompt in VP maps class subspaces between source and target domains. We identify a misalignment, termed class subspace inconsistency, between clean and poisoned datasets. 
Based on t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09540v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09540v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09540v1-abstract-full" style="display: none;"> Visual prompting (VP) is a new technique that adapts well-trained frozen models for source domain tasks to target domain tasks. This study examines VP's benefits for black-box model-level backdoor detection. The visual prompt in VP maps class subspaces between source and target domains. We identify a misalignment, termed class subspace inconsistency, between clean and poisoned datasets. Based on this, we introduce \textsc{BProm}, a black-box model-level detection method to identify backdoors in suspicious models, if any. \textsc{BProm} leverages the low classification accuracy of prompted models when backdoors are present. Extensive experiments confirm \textsc{BProm}'s effectiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09540v1-abstract-full').style.display = 'none'; document.getElementById('2411.09540v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08715">arXiv:2411.08715</a> <span> [<a href="https://arxiv.org/pdf/2411.08715">pdf</a>, <a href="https://arxiv.org/format/2411.08715">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Retrieval Augmented Recipe Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+G">Guoshan Liu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+H">Hailong Yin</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+B">Bin Zhu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jingjing Chen</a>, <a href="/search/cs?searchtype=author&query=Ngo%2C+C">Chong-Wah Ngo</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yu-Gang Jiang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08715v1-abstract-short" style="display: inline;"> Given the potential applications of generating recipes from food images, this area has garnered significant attention from researchers in recent years. Existing works for recipe generation primarily utilize a two-stage training method, first generating ingredients and then obtaining instructions from both the image and ingredients. 
Large Multi-modal Models (LMMs), which have achieved notable succe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08715v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08715v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08715v1-abstract-full" style="display: none;"> Given the potential applications of generating recipes from food images, this area has garnered significant attention from researchers in recent years. Existing works for recipe generation primarily utilize a two-stage training method, first generating ingredients and then obtaining instructions from both the image and ingredients. Large Multi-modal Models (LMMs), which have achieved notable success across a variety of vision and language tasks, shed light on generating both ingredients and instructions directly from images. Nevertheless, LMMs still face the common issue of hallucinations during recipe generation, leading to suboptimal performance. To tackle this, we propose a retrieval augmented large multimodal model for recipe generation. We first introduce Stochastic Diversified Retrieval Augmentation (SDRA) to retrieve recipes semantically related to the image from an existing datastore as a supplement, integrating them into the prompt to add diverse and rich context to the input image. Additionally, a Self-Consistency Ensemble Voting mechanism is proposed to determine the most confident prediction recipes as the final output. It calculates the consistency among generated recipe candidates, which use different retrieval recipes as context for generation. Extensive experiments validate the effectiveness of our proposed method, which demonstrates state-of-the-art (SOTA) performance in recipe generation tasks on the Recipe1M dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08715v1-abstract-full').style.display = 'none'; document.getElementById('2411.08715v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ACCEPT on IEEE/CVF Winter Conference on Applications of Computer Vision (WACV) 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08464">arXiv:2411.08464</a> <span> [<a href="https://arxiv.org/pdf/2411.08464">pdf</a>, <a href="https://arxiv.org/format/2411.08464">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> </div> </div> <p class="title is-5 mathjax"> Crystal Structure Generation Based On Material Properties </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chao Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">JiaHui Chen</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+H">HongRui Liang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">ChunYan Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Chen Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08464v1-abstract-short" style="display: inline;"> The discovery of new materials is very important to the field of materials science. When researchers explore new materials, they often have expected performance requirements for their crystal structure. In recent years, data-driven methods have made great progress in the direction plane of crystal structure generation, but there is still a lack of methods that can effectively map material properti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08464v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08464v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08464v1-abstract-full" style="display: none;"> The discovery of new materials is very important to the field of materials science. When researchers explore new materials, they often have expected performance requirements for their crystal structure. In recent years, data-driven methods have made great progress in the direction plane of crystal structure generation, but there is still a lack of methods that can effectively map material properties to crystal structure. In this paper, we propose a Crystal DiT model to generate the crystal structure from the expected material properties by embedding the material properties and combining the symmetry information predicted by the large language model. Experimental verification shows that our proposed method has good performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08464v1-abstract-full').style.display = 'none'; document.getElementById('2411.08464v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08370">arXiv:2411.08370</a> <span> [<a href="https://arxiv.org/pdf/2411.08370">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Fuzzy Reinforcement LSTM-based Long-term Prediction Model for Fault Conditions in Nuclear Power Plants </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Siwei Li</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+J">Jiayan Fang</a>, <a href="/search/cs?searchtype=author&query=Wua%2C+Y">Yichun Wua</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wei Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chengxin Li</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiangwen Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08370v1-abstract-short" style="display: inline;"> Early fault detection and timely maintenance scheduling can significantly mitigate operational risks in NPPs and enhance the reliability of operator decision-making. Therefore, it is necessary to develop an efficient Prognostics and Health Management (PHM) multi-step prediction model for predicting of system health status and prompt execution of maintenance operations. In this study, we propose a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08370v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08370v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08370v1-abstract-full" style="display: none;"> Early fault detection and timely maintenance scheduling can significantly mitigate operational risks in NPPs and enhance the reliability of operator decision-making. Therefore, it is necessary to develop an efficient Prognostics and Health Management (PHM) multi-step prediction model for predicting of system health status and prompt execution of maintenance operations. In this study, we propose a novel predictive model that integrates reinforcement learning with Long Short-Term Memory (LSTM) neural networks and the Expert Fuzzy Evaluation Method. The model is validated using parameter data for 20 different breach sizes in the Main Steam Line Break (MSLB) accident condition of the CPR1000 pressurized water reactor simulation model and it demonstrates a remarkable capability in accurately forecasting NPP parameter changes up to 128 steps ahead (with a time interval of 10 seconds per step, i.e., 1280 seconds), thereby satisfying the temporal advance requirement for fault prognostics in NPPs. Furthermore, this method provides an effective reference solution for PHM applications such as anomaly detection and remaining useful life prediction. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08370v1-abstract-full').style.display = 'none'; document.getElementById('2411.08370v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07841">arXiv:2411.07841</a> <span> [<a href="https://arxiv.org/pdf/2411.07841">pdf</a>, <a href="https://arxiv.org/format/2411.07841">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Federated Learning for Discrete Optimal Transport with Large Population under Incomplete Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kaur%2C+N">Navpreet Kaur</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Juntao Chen</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Y">Yingdong Lu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07841v1-abstract-short" style="display: inline;"> Optimal transport is a powerful framework for the efficient allocation of resources between sources and targets. However, traditional models often struggle to scale effectively in the presence of large and heterogeneous populations. In this work, we introduce a discrete optimal transport framework designed to handle large-scale, heterogeneous target populations, characterized by type distributions… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07841v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07841v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07841v1-abstract-full" style="display: none;"> Optimal transport is a powerful framework for the efficient allocation of resources between sources and targets. However, traditional models often struggle to scale effectively in the presence of large and heterogeneous populations. In this work, we introduce a discrete optimal transport framework designed to handle large-scale, heterogeneous target populations, characterized by type distributions. We address two scenarios: one where the type distribution of targets is known, and one where it is unknown. For the known distribution, we propose a fully distributed algorithm to achieve optimal resource allocation. In the case of unknown distribution, we develop a federated learning-based approach that enables efficient computation of the optimal transport scheme while preserving privacy. Case studies are provided to evaluate the performance of our learning algorithm. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07841v1-abstract-full').style.display = 'none'; document.getElementById('2411.07841v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07763">arXiv:2411.07763</a> <span> [<a href="https://arxiv.org/pdf/2411.07763">pdf</a>, <a href="https://arxiv.org/format/2411.07763">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Databases">cs.DB</span> </div> </div> <p class="title is-5 mathjax"> Spider 2.0: Evaluating Language Models on Real-World Enterprise Text-to-SQL Workflows </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lei%2C+F">Fangyu Lei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jixuan Chen</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+Y">Yuxiao Ye</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+R">Ruisheng Cao</a>, <a href="/search/cs?searchtype=author&query=Shin%2C+D">Dongchan Shin</a>, <a href="/search/cs?searchtype=author&query=Su%2C+H">Hongjin Su</a>, <a href="/search/cs?searchtype=author&query=Suo%2C+Z">Zhaoqing Suo</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+H">Hongcheng Gao</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+W">Wenjing Hu</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+P">Pengcheng Yin</a>, <a href="/search/cs?searchtype=author&query=Zhong%2C+V">Victor Zhong</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+C">Caiming Xiong</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+R">Ruoxi Sun</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Q">Qian Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sida Wang</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+T">Tao Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07763v1-abstract-short" style="display: inline;"> Real-world enterprise text-to-SQL workflows often involve complex cloud or local data across various database systems, multiple SQL queries in various dialects, and diverse operations from data transformation to analytics. We introduce Spider 2.0, an evaluation framework comprising 632 real-world text-to-SQL workflow problems derived from enterprise-level database use cases. 
The databases in Spide… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07763v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07763v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07763v1-abstract-full" style="display: none;"> Real-world enterprise text-to-SQL workflows often involve complex cloud or local data across various database systems, multiple SQL queries in various dialects, and diverse operations from data transformation to analytics. We introduce Spider 2.0, an evaluation framework comprising 632 real-world text-to-SQL workflow problems derived from enterprise-level database use cases. The databases in Spider 2.0 are sourced from real data applications, often containing over 1,000 columns and stored in local or cloud database systems such as BigQuery and Snowflake. We show that solving problems in Spider 2.0 frequently requires understanding and searching through database metadata, dialect documentation, and even project-level codebases. This challenge calls for models to interact with complex SQL workflow environments, process extremely long contexts, perform intricate reasoning, and generate multiple SQL queries with diverse operations, often exceeding 100 lines, which goes far beyond traditional text-to-SQL challenges. Our evaluations indicate that based on o1-preview, our code agent framework successfully solves only 17.0% of the tasks, compared with 91.2% on Spider 1.0 and 73.0% on BIRD. Our results on Spider 2.0 show that while language models have demonstrated remarkable performance in code generation -- especially in prior text-to-SQL benchmarks -- they require significant improvement in order to achieve adequate performance for real-world enterprise usage. Progress on Spider 2.0 represents crucial steps towards developing intelligent, autonomous, code agents for real-world enterprise settings. Our code, baseline models, and data are available at https://spider2-sql.github.io. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07763v1-abstract-full').style.display = 'none'; document.getElementById('2411.07763v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07541">arXiv:2411.07541</a> <span> [<a href="https://arxiv.org/pdf/2411.07541">pdf</a>, <a href="https://arxiv.org/format/2411.07541">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HiCoM: Hierarchical Coherent Motion for Streamable Dynamic Scene with 3D Gaussian Splatting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+Q">Qiankun Gao</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+J">Jiarui Meng</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+C">Chengxiang Wen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jie Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Jian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07541v1-abstract-short" style="display: inline;"> The online reconstruction of dynamic scenes from multi-view streaming videos faces significant challenges in training, rendering and storage efficiency. Harnessing superior learning speed and real-time rendering capabilities, 3D Gaussian Splatting (3DGS) has recently demonstrated considerable potential in this field. However, 3DGS can be inefficient in terms of storage and prone to overfitting by… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07541v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07541v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07541v1-abstract-full" style="display: none;"> The online reconstruction of dynamic scenes from multi-view streaming videos faces significant challenges in training, rendering and storage efficiency. Harnessing superior learning speed and real-time rendering capabilities, 3D Gaussian Splatting (3DGS) has recently demonstrated considerable potential in this field. However, 3DGS can be inefficient in terms of storage and prone to overfitting by excessively growing Gaussians, particularly with limited views. This paper proposes an efficient framework, dubbed HiCoM, with three key components. First, we construct a compact and robust initial 3DGS representation using a perturbation smoothing strategy. Next, we introduce a Hierarchical Coherent Motion mechanism that leverages the inherent non-uniform distribution and local consistency of 3D Gaussians to swiftly and accurately learn motions across frames. Finally, we continually refine the 3DGS with additional Gaussians, which are later merged into the initial 3DGS to maintain consistency with the evolving scene. To preserve a compact representation, an equivalent number of low-opacity Gaussians that minimally impact the representation are removed before processing subsequent frames. Extensive experiments conducted on two widely used datasets show that our framework improves learning efficiency of the state-of-the-art methods by about $20\%$ and reduces the data storage by $85\%$, achieving competitive free-viewpoint video synthesis quality but with higher robustness and stability. 
Moreover, by learning multiple frames in parallel, our HiCoM decreases the average training wall time to $<2$ seconds per frame with negligible performance degradation, substantially boosting real-world applicability and responsiveness. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07541v1-abstract-full').style.display = 'none'; document.getElementById('2411.07541v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024; Code is available at https://github.com/gqk/HiCoM</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07527">arXiv:2411.07527</a> <span> [<a href="https://arxiv.org/pdf/2411.07527">pdf</a>, <a href="https://arxiv.org/format/2411.07527">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.24963/ijcai.2024/707">10.24963/ijcai.2024/707 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Prompt-enhanced Network for Hateful Meme Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+J">Junxi Liu</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+Y">Yanyan Feng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiehai Chen</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+Y">Yun Xue</a>, <a href="/search/cs?searchtype=author&query=Li%2C+F">Fenghuan Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07527v1-abstract-short" style="display: inline;"> The dynamic expansion of social media has led to an inundation of hateful memes on media platforms, accentuating the growing need for efficient identification and removal. Acknowledging the constraints of conventional multimodal hateful meme classification, which heavily depends on external knowledge and poses the risk of including irrelevant or redundant content, we developed Pen -- a prompt-enha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07527v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07527v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07527v1-abstract-full" style="display: none;"> The dynamic expansion of social media has led to an inundation of hateful memes on media platforms, accentuating the growing need for efficient identification and removal. 
Acknowledging the constraints of conventional multimodal hateful meme classification, which heavily depends on external knowledge and poses the risk of including irrelevant or redundant content, we developed Pen -- a prompt-enhanced network framework based on the prompt learning approach. Specifically, after constructing the sequence through the prompt method and encoding it with a language model, we performed region information global extraction on the encoded sequence for multi-view perception. By capturing global information about inference instances and demonstrations, Pen facilitates category selection by fully leveraging sequence information. This approach significantly improves model classification accuracy. Additionally, to bolster the model's reasoning capabilities in the feature space, we introduced prompt-aware contrastive learning into the framework to improve the quality of sample feature distributions. Through extensive ablation experiments on two public datasets, we evaluate the effectiveness of the Pen framework, concurrently comparing it with state-of-the-art model baselines. Our research findings highlight that Pen surpasses manual prompt methods, showcasing superior generalization and classification accuracy in hateful meme classification tasks. Our code is available at https://github.com/juszzi/Pen. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07527v1-abstract-full').style.display = 'none'; document.getElementById('2411.07527v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence Main Track. 
Pages 6397-6405</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07503">arXiv:2411.07503</a> <span> [<a href="https://arxiv.org/pdf/2411.07503">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Medical Physics">physics.med-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Tissues and Organs">q-bio.TO</span> </div> </div> <p class="title is-5 mathjax"> A Novel Automatic Real-time Motion Tracking Method for Magnetic Resonance Imaging-guided Radiotherapy: Leveraging the Enhanced Tracking-Learning-Detection Framework with Automatic Segmentation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shengqi Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zilin Wang</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+J">Jianrong Dai</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+S">Shirui Qin</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Ying Cao</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+R">Ruiao Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jiayun Chen</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+G">Guohua Wu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+Y">Yuan Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07503v1-abstract-short" style="display: inline;"> Objective: Ensuring the precision in motion tracking for MRI-guided Radiotherapy (MRIgRT) is crucial for the delivery of effective treatments. This study refined the motion tracking accuracy in MRIgRT through the innovation of an automatic real-time tracking method, leveraging an enhanced Tracking-Learning-Detection (ETLD) framework coupled with automatic segmentation. Methods: We developed a nove… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07503v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07503v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07503v1-abstract-full" style="display: none;"> Objective: Ensuring the precision in motion tracking for MRI-guided Radiotherapy (MRIgRT) is crucial for the delivery of effective treatments. This study refined the motion tracking accuracy in MRIgRT through the innovation of an automatic real-time tracking method, leveraging an enhanced Tracking-Learning-Detection (ETLD) framework coupled with automatic segmentation. Methods: We developed a novel MRIgRT motion tracking method by integrating two primary methods: the ETLD framework and an improved Chan-Vese model (ICV), named ETLD+ICV. The TLD framework was upgraded to suit real-time cine MRI, including advanced image preprocessing, no-reference image quality assessment, an enhanced median-flow tracker, and a refined detector with dynamic search region adjustments. 
Additionally, ICV was combined for precise coverage of the target volume, which refined the segmented region frame by frame using tracking results, with key parameters optimized. Tested on 3.5D MRI scans from 10 patients with liver metastases, our method ensures precise tracking and accurate segmentation vital for MRIgRT. Results: An evaluation of 106,000 frames across 77 treatment fractions revealed sub-millimeter tracking errors of less than 0.8mm, with over 99% precision and 98% recall for all subjects, underscoring the robustness and efficacy of the ETLD. Moreover, the ETLD+ICV yielded a dice global score of more than 82% for all subjects, demonstrating the proposed method's extensibility and precise target volume coverage. Conclusions: This study successfully developed an automatic real-time motion tracking method for MRIgRT that markedly surpasses current methods. The novel method not only delivers exceptional precision in tracking and segmentation but also demonstrates enhanced adaptability to clinical demands, positioning it as an indispensable asset in the quest to augment the efficacy of radiotherapy treatments. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07503v1-abstract-full').style.display = 'none'; document.getElementById('2411.07503v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07498">arXiv:2411.07498</a> <span> [<a href="https://arxiv.org/pdf/2411.07498">pdf</a>, <a href="https://arxiv.org/format/2411.07498">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Semantic Sleuth: Identifying Ponzi Contracts via Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+C">Cong Wu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+J">Jing Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziwei Wang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+R">Ruichao Liang</a>, <a href="/search/cs?searchtype=author&query=Du%2C+R">Ruiying Du</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07498v1-abstract-short" style="display: inline;"> Smart contracts, self-executing agreements directly encoded in code, are fundamental to blockchain technology, especially in decentralized finance (DeFi) and Web3. However, the rise of Ponzi schemes in smart contracts poses significant risks, leading to substantial financial losses and eroding trust in blockchain systems. 
Existing detection methods, such as PonziGuard, depend on large amounts of l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07498v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07498v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07498v1-abstract-full" style="display: none;"> Smart contracts, self-executing agreements directly encoded in code, are fundamental to blockchain technology, especially in decentralized finance (DeFi) and Web3. However, the rise of Ponzi schemes in smart contracts poses significant risks, leading to substantial financial losses and eroding trust in blockchain systems. Existing detection methods, such as PonziGuard, depend on large amounts of labeled data and struggle to identify unseen Ponzi schemes, limiting their reliability and generalizability. In contrast, we introduce PonziSleuth, the first LLM-driven approach for detecting Ponzi smart contracts, which requires no labeled training data. PonziSleuth utilizes advanced language understanding capabilities of LLMs to analyze smart contract source code through a novel two-step zero-shot chain-of-thought prompting technique. Our extensive evaluation on benchmark datasets and real-world contracts demonstrates that PonziSleuth delivers comparable, and often superior, performance without the extensive data requirements, achieving a balanced detection accuracy of 96.06% with GPT-3.5-turbo, 93.91% with LLAMA3, and 94.27% with Mistral. In real-world detection, PonziSleuth successfully identified 15 new Ponzi schemes from 4,597 contracts verified by Etherscan in March 2024, with a false negative rate of 0% and a false positive rate of 0.29%. These results highlight PonziSleuth's capability to detect diverse and novel Ponzi schemes, marking a significant advancement in leveraging LLMs for enhancing blockchain security and mitigating financial scams. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07498v1-abstract-full').style.display = 'none'; document.getElementById('2411.07498v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">12 pages</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Chen%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Chen%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>