Search | arXiv e-print repository
Showing 1–50 of 751 results for author: Cheng, J

Search v0.5.6 released 2020-02-24 (https://github.com/arXiv/arxiv-search/releases)

Searching in archive cs. Search in all archives: /search/?searchtype=author&query=Cheng%2C+J
type="text" value="Cheng, J"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Cheng%2C+J&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Cheng, J"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Cheng%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Cheng%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Cheng%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Cheng%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Cheng%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Cheng%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17335">arXiv:2411.17335</a> <span> [<a href="https://arxiv.org/pdf/2411.17335">pdf</a>, <a href="https://arxiv.org/format/2411.17335">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MotionLLaMA: A Unified Framework for Motion Synthesis and Comprehension </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zeyu Ling</a>, <a href="/search/cs?searchtype=author&query=Han%2C+B">Bo Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+S">Shiyang Li</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+H">Hongdeng Shen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jikang Cheng</a>, <a href="/search/cs?searchtype=author&query=Zou%2C+C">Changqing Zou</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17335v1-abstract-short" style="display: inline;"> This paper introduces MotionLLaMA, a unified framework for motion synthesis and comprehension, along with a novel full-body motion tokenizer called the HoMi Tokenizer. MotionLLaMA is developed based on three core principles. First, it establishes a powerful unified representation space through the HoMi Tokenizer. 
Using a single codebook, the HoMi Tokenizer in MotionLLaMA achieves reconstruction ac… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17335v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17335v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17335v1-abstract-full" style="display: none;"> This paper introduces MotionLLaMA, a unified framework for motion synthesis and comprehension, along with a novel full-body motion tokenizer called the HoMi Tokenizer. MotionLLaMA is developed based on three core principles. First, it establishes a powerful unified representation space through the HoMi Tokenizer. Using a single codebook, the HoMi Tokenizer in MotionLLaMA achieves reconstruction accuracy comparable to residual vector quantization tokenizers utilizing six codebooks, outperforming all existing single-codebook tokenizers. Second, MotionLLaMA integrates a large language model to tackle various motion-related tasks. This integration bridges various modalities, facilitating both comprehensive and intricate motion synthesis and comprehension. Third, MotionLLaMA introduces the MotionHub dataset, currently the most extensive multimodal, multitask motion dataset, which enables fine-tuning of large language models. Extensive experimental results demonstrate that MotionLLaMA not only covers the widest range of motion-related tasks but also achieves state-of-the-art (SOTA) performance in motion completion, interaction dual-person text-to-motion, and all comprehension tasks while reaching performance comparable to SOTA in the remaining tasks. The code and MotionHub dataset are publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17335v1-abstract-full').style.display = 'none'; document.getElementById('2411.17335v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17253">arXiv:2411.17253</a> <span> [<a href="https://arxiv.org/pdf/2411.17253">pdf</a>, <a href="https://arxiv.org/format/2411.17253">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> LHPF: Look back the History and Plan for the Future in Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sheng Wang</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+Y">Yao Tian</a>, <a href="/search/cs?searchtype=author&query=Mei%2C+X">Xiaodong Mei</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+G">Ge Sun</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jie Cheng</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+F">Fulong Ma</a>, <a href="/search/cs?searchtype=author&query=Sander%2C+P+V">Pedro V. 
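The listing gives no detail on the HoMi Tokenizer's internals; as a generic illustration only, a single-codebook motion tokenizer of the kind the abstract compares against reduces at inference time to a nearest-neighbour codebook lookup. The sketch below (PyTorch; all names are hypothetical, not from the paper) shows that generic step:

```python
import torch

def quantize(z, codebook):
    """Generic single-codebook VQ step: map each continuous feature
    vector to its nearest codebook entry (not the paper's tokenizer)."""
    flat = z.reshape(-1, z.shape[-1])             # (N, d)
    idx = torch.cdist(flat, codebook).argmin(-1)  # nearest entry per vector
    return idx.view(z.shape[:-1]), codebook[idx].view_as(z)

# Toy usage: 8 frames of 16-dim motion features, 512-entry codebook.
tokens, z_q = quantize(torch.randn(8, 16), torch.randn(512, 16))
```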
2. arXiv:2411.17253 [pdf, other] cs.RO cs.CV
LHPF: Look back the History and Plan for the Future in Autonomous Driving
Authors: Sheng Wang, Yao Tian, Xiaodong Mei, Ge Sun, Jie Cheng, Fulong Ma, Pedro V. Sander, Junwei Liang
Abstract: Decision-making and planning in autonomous driving critically reflect the safety of the system, making effective planning imperative. Current imitation learning-based planning algorithms often merge historical trajectories with present observations to predict future candidate paths. However, these algorithms typically assess the current and historical plans independently, leading to discontinuities in driving intentions and an accumulation of errors with each step in a discontinuous plan. To tackle this challenge, this paper introduces LHPF, an imitation learning planner that integrates historical planning information. Our approach employs a historical intention aggregation module that pools historical planning intentions, which are then combined with a spatial query vector to decode the final planning trajectory. Furthermore, we incorporate a comfort auxiliary task to enhance the human-like quality of the driving behavior. Extensive experiments using both real-world and synthetic data demonstrate that LHPF not only surpasses existing advanced learning-based planners in planning performance but also marks the first instance of a purely learning-based planner outperforming the expert. Additionally, the application of the historical intention aggregation module across various backbones highlights the considerable potential of the proposed method. The code will be made publicly available.
Submitted 26 November, 2024; originally announced November 2024.
3. arXiv:2411.15538 [pdf, other] cs.HC
That Flick is Sick: Gyroscope Integration in Xbox Controllers
Authors: Jhervey Edric Cheng, Stacy Selena Kalaw, James Patrick Kok, Alyssa Ysabelle Meneses, Richard Sy, Jordan Aiko Deja
Abstract: Gyroscope integration in Xbox controllers offers new possibilities for enhancing gaming experiences, particularly in first-person shooter (FPS) games. To investigate its potential, we conducted an empirical study with 11 participants, comparing aim precision and reaction times across three input methods: a computer mouse, a standard Xbox controller, and a gyroscope-enabled controller. Participants completed an aim training task, revealing the mouse as the most accurate device, followed by the standard controller. Interestingly, the gyroscope-enabled controller showed reduced accuracy and slower reaction times, attributed to challenges in sensitivity and control. Participant feedback highlighted areas for improvement, including refined sensitivity settings, control stability, and software design. These findings underscore the need for design innovations, such as camera rotation limits and optimized sensitivity thresholds, to make gyroscope-enabled controllers more competitive. Future work should consider diverse gamer profiles and extended evaluation contexts to better understand the role of gyroscopes in gaming interfaces.
Submitted 23 November, 2024; originally announced November 2024.
Comments: 11 pages, 7 figures, 1 snippet, 11 references
Journal ref: Proceedings of CHIRP 2024: Transforming HCI Research in the Philippines Workshop

4. arXiv:2411.14696 [pdf, other] quant-ph cs.AI cs.LG
Quantum Hamiltonian Descent for Graph Partition
Authors: Jinglei Cheng, Ruilin Zhou, Yuhang Gan, Chen Qian, Junyu Liu
Abstract: We introduce Quantum Hamiltonian Descent (QHD) as a novel approach to solve the graph partition problem. By reformulating graph partition as a Quadratic Unconstrained Binary Optimization (QUBO) problem, we leverage QHD's quantum-inspired dynamics to identify optimal community structures. Our method implements a multi-level refinement strategy that alternates between QUBO formulation and QHD optimization to iteratively improve partition quality. Experimental results demonstrate that our QHD-based approach achieves superior modularity scores (up to 5.49% improvement) with reduced computational overhead compared to traditional optimization methods. This work establishes QHD as an effective quantum-inspired framework for tackling graph partition challenges in large-scale networks.
Submitted 21 November, 2024; originally announced November 2024.
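To make the reformulation the abstract mentions concrete, a standard two-community, modularity-based graph-partition QUBO can be written in a few lines. The sketch below is a generic textbook construction solved by brute force on a toy graph; it stands in neither for QHD nor for the paper's multi-level strategy:

```python
import itertools
import numpy as np

# Toy graph: two triangles (nodes 0-2 and 3-5) joined by one edge.
edges = [(0, 1), (0, 2), (1, 2), (3, 4), (3, 5), (4, 5), (2, 3)]
n = 6
A = np.zeros((n, n))
for i, j in edges:
    A[i, j] = A[j, i] = 1

k = A.sum(axis=1)                   # node degrees
m = A.sum() / 2                     # number of edges
B = A - np.outer(k, k) / (2 * m)    # modularity matrix

# QUBO: minimize x^T Q x with Q = -B; x_i in {0, 1} assigns node i to one
# of two communities. Rows of B sum to zero, so the linear terms cancel
# and x^T B x is proportional to the modularity of the two-way split.
Q = -B

best = min(itertools.product([0, 1], repeat=n),
           key=lambda x: np.array(x) @ Q @ np.array(x))
print("partition:", best)  # expect the two triangles in separate communities
```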
5. arXiv:2411.13997 [pdf, other] cs.CV cs.AI
Mirror Target YOLO: An Improved YOLOv8 Method with Indirect Vision for Heritage Buildings Fire Detection
Authors: Jian Liang, JunSheng Cheng
Abstract: Fires can cause severe damage to heritage buildings, making timely fire detection essential. Traditional dense cabling and drilling can harm these structures, so reducing the number of cameras to minimize such impact is challenging. Additionally, avoiding false alarms due to noise sensitivity and preserving the expertise of managers in fire-prone areas is crucial. To address these needs, we propose a fire detection method based on indirect vision, called Mirror Target YOLO (MITA-YOLO). MITA-YOLO integrates indirect vision deployment and an enhanced detection module. It uses mirror angles to achieve indirect views, solving issues with limited visibility in irregular spaces and aligning each indirect view with the target monitoring area. The Target-Mask module is designed to automatically identify and isolate the indirect vision areas in each image, filtering out non-target areas. This enables the model to inherit managers' expertise in assessing fire-risk zones, improving focus and resistance to interference in fire detection. In our experiments, we created an 800-image fire dataset with indirect vision. Results show that MITA-YOLO significantly reduces camera requirements while achieving superior detection performance compared to other mainstream models.
Submitted 21 November, 2024; originally announced November 2024.

6. arXiv:2411.13908 [pdf, other] cs.RO
Hybrid Physics-ML Modeling for Marine Vehicle Maneuvering Motions in the Presence of Environmental Disturbances
Authors: Zihao Wang, Jian Cheng, Liang Xu, Lizhu Hao, Yan Peng
Abstract: A hybrid physics-machine learning modeling framework is proposed for surface vehicles' maneuvering motions to address modeling capability and stability in the presence of environmental disturbances. From a deep learning perspective, the framework is based on a variant of residual networks with additional feature extraction. Initially, an imperfect physical model is derived and identified to capture the fundamental hydrodynamic characteristics of marine vehicles. This model is then integrated with a feedforward network through a residual block. Additionally, feature extraction from trigonometric transformations is employed in the machine learning component to account for the periodic influence of currents and waves. The proposed method is evaluated using real navigational data from the 'JH7500' unmanned surface vehicle. The results demonstrate the robust generalizability and accurate long-term prediction capabilities of the nonlinear dynamic model in specific environmental conditions. This approach has the potential to be extended and applied to develop a comprehensive high-fidelity simulator.
Submitted 21 November, 2024; originally announced November 2024.
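The residual combination described in this abstract (an identified but imperfect physical model plus a feedforward network, with trigonometric features for the periodic disturbance terms) can be sketched generically. The code below is an illustrative PyTorch skeleton under those assumptions; the class and argument names are hypothetical, not from the paper:

```python
import torch
import torch.nn as nn

class HybridResidual(nn.Module):
    """Sketch of a physics + learned-residual model.

    `physics` is any callable implementing the identified but imperfect
    physical model; the network learns only the mismatch. sin/cos of
    angle-like inputs expose periodic current/wave effects, echoing the
    trigonometric feature extraction the abstract describes.
    """

    def __init__(self, physics, state_dim=3, n_angles=1, hidden=64):
        super().__init__()
        self.physics = physics
        self.net = nn.Sequential(
            nn.Linear(state_dim + 2 * n_angles, hidden),
            nn.ReLU(),
            nn.Linear(hidden, state_dim),
        )

    def forward(self, state, angles):
        feats = torch.cat([state, torch.sin(angles), torch.cos(angles)], dim=-1)
        return self.physics(state) + self.net(feats)  # residual combination

# Toy usage: linear damping as the "physics", one wave-encounter angle.
model = HybridResidual(physics=lambda s: -0.1 * s)
out = model(torch.randn(8, 3), torch.rand(8, 1) * 6.28)
```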
7. arXiv:2411.12814 [pdf, other] cs.CV
Interactive Medical Image Segmentation: A Benchmark Dataset and Baseline
Authors: Junlong Cheng, Bin Fu, Jin Ye, Guoan Wang, Tianbin Li, Haoyu Wang, Ruoyu Li, He Yao, Junren Chen, Jingwen Li, Yanzhou Su, Min Zhu, Junjun He
Abstract: Interactive Medical Image Segmentation (IMIS) has long been constrained by the limited availability of large-scale, diverse, and densely annotated datasets, which hinders model generalization and consistent evaluation across different models. In this paper, we introduce the IMed-361M benchmark dataset, a significant advancement in general IMIS research. First, we collect and standardize over 6.4 million medical images and their corresponding ground truth masks from multiple data sources. Then, leveraging the strong object recognition capabilities of a vision foundational model, we automatically generate dense interactive masks for each image and ensure their quality through rigorous quality control and granularity management. Unlike previous datasets, which are limited by specific modalities or sparse annotations, IMed-361M spans 14 modalities and 204 segmentation targets, totaling 361 million masks (an average of 56 masks per image). Finally, we developed an IMIS baseline network on this dataset that supports high-quality mask generation through interactive inputs, including clicks, bounding boxes, text prompts, and their combinations. We evaluate its performance on medical image segmentation tasks from multiple perspectives, demonstrating superior accuracy and scalability compared to existing interactive segmentation models. To facilitate research on foundational models in medical computer vision, we release the IMed-361M and model at https://github.com/uni-medical/IMIS-Bench.
Submitted 24 November, 2024; v1 submitted 19 November, 2024; originally announced November 2024.
8. arXiv:2411.11396 [pdf, other] cs.CV
Stacking Brick by Brick: Aligned Feature Isolation for Incremental Face Forgery Detection
Authors: Jikang Cheng, Zhiyuan Yan, Ying Zhang, Li Hao, Jiaxin Ai, Qin Zou, Chen Li, Zhongyuan Wang
Abstract: The rapid advancement of face forgery techniques has introduced a growing variety of forgeries. Incremental Face Forgery Detection (IFFD), involving gradually adding new forgery data to fine-tune the previously trained model, has been introduced as a promising strategy to deal with evolving forgery methods. However, a naively trained IFFD model is prone to catastrophic forgetting when new forgeries are integrated: treating all forgeries as a single "Fake" class in Real/Fake classification can cause different forgery types to override one another, so unique characteristics from earlier tasks are forgotten, limiting the model's effectiveness in learning forgery specificity and generality. In this paper, we propose to stack the latent feature distributions of previous and new tasks brick by brick, i.e., to achieve aligned feature isolation. In this manner, we aim to preserve learned forgery information and accumulate new knowledge by minimizing distribution overriding, thereby mitigating catastrophic forgetting. To achieve this, we first introduce Sparse Uniform Replay (SUR) to obtain representative subsets that can be treated as uniformly sparse versions of the previous global distributions. We then propose a Latent-space Incremental Detector (LID) that leverages SUR data to isolate and align distributions. For evaluation, we construct a more advanced and comprehensive benchmark tailored for IFFD. The leading experimental results validate the superiority of our method.
Submitted 19 November, 2024; v1 submitted 18 November, 2024; originally announced November 2024.
9. arXiv:2411.10752 [pdf, other] eess.IV cs.CV
Towards a Comprehensive Benchmark for Pathological Lymph Node Metastasis in Breast Cancer Sections
Authors: Xitong Ling, Yuanyuan Lei, Jiawen Li, Junru Cheng, Wenting Huang, Tian Guan, Jian Guan, Yonghong He
Abstract: Advances in optical microscopy scanning have significantly contributed to computational pathology (CPath) by converting traditional histopathological slides into whole slide images (WSIs). This development enables comprehensive digital reviews by pathologists and accelerates AI-driven diagnostic support for WSI analysis. Recent advances in foundational pathology models have increased the need for benchmarking tasks. The Camelyon series is one of the most widely used open-source datasets in computational pathology. However, the quality, accessibility, and clinical relevance of the labels have not been comprehensively evaluated. In this study, we reprocessed 1,399 WSIs and labels from the Camelyon-16 and Camelyon-17 datasets, removing low-quality slides, correcting erroneous labels, and providing expert pixel annotations for tumor regions in the previously unreleased test set. Based on the sizes of re-annotated tumor regions, we upgraded the binary cancer screening task to a four-class task: negative, micro-metastasis, macro-metastasis, and Isolated Tumor Cells (ITC). We reevaluated pre-trained pathology feature extractors and multiple instance learning (MIL) methods using the cleaned dataset, providing a benchmark that advances AI development in histopathology.
Submitted 16 November, 2024; originally announced November 2024.
10. arXiv:2411.10060 [pdf, other] cs.MM cs.CL
CMATH: Cross-Modality Augmented Transformer with Hierarchical Variational Distillation for Multimodal Emotion Recognition in Conversation
Authors: Xiaofei Zhu, Jiawei Cheng, Zhou Yang, Zhuo Chen, Qingyang Wang, Jianfeng Yao
Abstract: Multimodal emotion recognition in conversation (MER) aims to accurately identify emotions in conversational utterances by integrating multimodal information. Previous methods usually treat multimodal information as being of equal quality and employ symmetric architectures to conduct multimodal fusion. However, in reality, the quality of different modalities usually varies considerably, and a symmetric architecture struggles to recognize conversational emotions accurately when dealing with uneven modal information. Furthermore, fusing multimodal information at a single granularity may fail to adequately integrate modal information, exacerbating the inaccuracy in emotion recognition. In this paper, we propose a novel Cross-Modality Augmented Transformer with Hierarchical Variational Distillation, called CMATH, which consists of two major components, i.e., Multimodal Interaction Fusion and Hierarchical Variational Distillation. The former comprises two submodules, Modality Reconstruction and the Cross-Modality Augmented Transformer (CMA-Transformer): Modality Reconstruction focuses on obtaining a high-quality compressed representation of each modality, and the CMA-Transformer adopts an asymmetric fusion strategy that treats one modality as the central modality and the others as auxiliary modalities. The latter first designs a variational fusion network to fuse the fine-grained representations learned by the CMA-Transformer into a coarse-grained representation. It then introduces a hierarchical distillation framework to maintain consistency between modality representations at different granularities. Experiments on the IEMOCAP and MELD datasets demonstrate that our proposed model outperforms previous state-of-the-art baselines. Implementation code is available at https://github.com/cjw-MER/CMATH.
Submitted 15 November, 2024; originally announced November 2024.

11. arXiv:2411.09694 [pdf, other] cs.CL
A Bayesian Optimization Approach to Machine Translation Reranking
Authors: Julius Cheng, Maike Züfle, Vilém Zouhar, Andreas Vlachos
Abstract: Reranking a list of candidates from a machine translation system with an external scoring model and returning the highest-scoring candidate remains a simple and effective method for improving the overall output quality. Translation scoring models continue to grow in size, with the best models being comparable to generation models. Thus, reranking can add substantial computational cost to the translation pipeline. In this work, we pose reranking as a Bayesian optimization (BayesOpt) problem. By strategically selecting candidates to score based on a balance of exploration and exploitation, we show that it is possible to find top-scoring candidates when scoring only a fraction of the candidate list. For instance, our method achieves the same CometKiwi score using only 70 scoring evaluations compared to a baseline system using 180. We present a multi-fidelity setting for BayesOpt, where the candidates are first scored with a cheaper but noisier proxy scoring model, which further improves the cost-performance tradeoff when using smaller but well-trained distilled proxy scorers.
Submitted 14 November, 2024; originally announced November 2024.
Comments: v1: Preprint version
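The idea of scoring only a strategically chosen fraction of the candidate list can be illustrated with a much simpler stand-in: the sketch below replaces the Bayesian-optimization acquisition step with plain epsilon-greedy selection over a cheap proxy ranking (a multi-fidelity flavour). It shows only the budgeted explore/exploit loop, not the authors' method:

```python
import random

def rerank_budgeted(candidates, proxy_score, true_score, budget=70, explore=0.3):
    """Generic sketch of budgeted reranking (epsilon-greedy stand-in
    for a BayesOpt acquisition function; not the paper's algorithm).

    Spends `budget` calls to the expensive scorer on candidates chosen
    by a mix of exploitation (best by cheap proxy first) and exploration
    (random picks), then returns the best candidate actually scored.
    """
    pool = sorted(candidates, key=proxy_score, reverse=True)  # cheap pass
    scored = {}
    for _ in range(min(budget, len(pool))):
        if random.random() < explore:                 # explore: random pick
            pick = pool.pop(random.randrange(len(pool)))
        else:                                         # exploit: best by proxy
            pick = pool.pop(0)
        scored[pick] = true_score(pick)               # expensive scorer call
    return max(scored, key=scored.get)

# Toy usage: proxy = length, "true" metric = vowel count.
best = rerank_budgeted(["alpha", "beta", "gamma"], len,
                       lambda s: sum(c in "aeiou" for c in s), budget=2)
```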
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">v1: Preprint version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09224">arXiv:2411.09224</a> <span> [<a href="https://arxiv.org/pdf/2411.09224">pdf</a>, <a href="https://arxiv.org/format/2411.09224">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Programming with AI: Evaluating ChatGPT, Gemini, AlphaCode, and GitHub Copilot for Programmers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Siam%2C+M+K">Md Kamrul Siam</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+H">Huanying Gu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J+Q">Jerry Q. Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09224v1-abstract-short" style="display: inline;"> Our everyday lives now heavily rely on artificial intelligence (AI) powered large language models (LLMs). Like regular users, programmers are also benefiting from the newest large language models. In response to the critical role that AI models play in modern software development, this study presents a thorough evaluation of leading programming assistants, including ChatGPT, Gemini(Bard AI), Alpha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09224v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09224v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09224v1-abstract-full" style="display: none;"> Our everyday lives now heavily rely on artificial intelligence (AI) powered large language models (LLMs). Like regular users, programmers are also benefiting from the newest large language models. In response to the critical role that AI models play in modern software development, this study presents a thorough evaluation of leading programming assistants, including ChatGPT, Gemini(Bard AI), AlphaCode, and GitHub Copilot. The evaluation is based on tasks like natural language processing and code generation accuracy in different programming languages like Java, Python and C++. Based on the results, it has emphasized their strengths and weaknesses and the importance of further modifications to increase the reliability and accuracy of the latest popular models. Although these AI assistants illustrate a high level of progress in language understanding and code generation, along with ethical considerations and responsible usage, they provoke a necessity for discussion. With time, developing more refined AI technology is essential for achieving advanced solutions in various fields, especially with the knowledge of the feature intricacies of these models and their implications. This study offers a comparison of different LLMs and provides essential feedback on the rapidly changing area of AI models. It also emphasizes the need for ethical developmental practices to actualize AI models' full potential. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09224v1-abstract-full').style.display = 'none'; document.getElementById('2411.09224v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06444">arXiv:2411.06444</a> <span> [<a href="https://arxiv.org/pdf/2411.06444">pdf</a>, <a href="https://arxiv.org/format/2411.06444">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> </div> </div> <p class="title is-5 mathjax"> SamRobNODDI: Q-Space Sampling-Augmented Continuous Representation Learning for Robust and Generalized NODDI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Taohui Xiao</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jian Cheng</a>, <a href="/search/cs?searchtype=author&query=Fan%2C+W">Wenxin Fan</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+E">Enqing Dong</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+H">Hairong Zheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shanshan Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06444v1-abstract-short" style="display: inline;"> Neurite Orientation Dispersion and Density Imaging (NODDI) microstructure estimation from diffusion magnetic resonance imaging (dMRI) is of great significance for the discovery and treatment of various neurological diseases. Current deep learning-based methods accelerate the speed of NODDI parameter estimation and improve the accuracy. However, most methods require the number and coordinates of gr… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06444v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06444v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06444v1-abstract-full" style="display: none;"> Neurite Orientation Dispersion and Density Imaging (NODDI) microstructure estimation from diffusion magnetic resonance imaging (dMRI) is of great significance for the discovery and treatment of various neurological diseases. Current deep learning-based methods accelerate the speed of NODDI parameter estimation and improve the accuracy. However, most methods require the number and coordinates of gradient directions during testing and training to remain strictly consistent, significantly limiting the generalization and robustness of these models in NODDI parameter estimation. 
In this paper, we propose a q-space sampling augmentation-based continuous representation learning framework (SamRobNODDI) to achieve robust and generalized NODDI. Specifically, a continuous representation learning method based on q-space sampling augmentation is introduced to fully explore the information between different gradient directions in q-space. Furthermore, we design a sampling consistency loss to constrain the outputs of different sampling schemes, ensuring that the outputs remain as consistent as possible, thereby further enhancing performance and robustness to varying q-space sampling schemes. SamRobNODDI is also a flexible framework that can be applied to different backbone networks. To validate the effectiveness of the proposed method, we compared it with 7 state-of-the-art methods across 18 different q-space sampling schemes, demonstrating that the proposed SamRobNODDI has better performance, robustness, generalization, and flexibility. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06444v1-abstract-full').style.display = 'none'; document.getElementById('2411.06444v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01779">arXiv:2411.01779</a> <span> [<a href="https://arxiv.org/pdf/2411.01779">pdf</a>, <a href="https://arxiv.org/format/2411.01779">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> TabSec: A Collaborative Framework for Novel Insider Threat Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+Z">Zilin Huang</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xiangyan Tang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Hongyu Li</a>, <a href="/search/cs?searchtype=author&query=Cao%2C+X">Xinyi Cao</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jieren Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01779v1-abstract-short" style="display: inline;"> In the era of the Internet of Things (IoT) and data sharing, users frequently upload their personal information to enterprise databases to enjoy enhanced service experiences provided by various online services. However, the widespread presence of system vulnerabilities, remote network intrusions, and insider threats significantly increases the exposure of private enterprise data on the internet. 
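<p class="is-size-7">A sampling-consistency loss of the kind the SamRobNODDI result above describes can be sketched in a few lines of PyTorch: train on one random subset of gradient directions and penalize disagreement with a second subset of the same voxels. <code>model</code>, <code>dirs</code>, and the weight <code>w</code> are illustrative assumptions, not the paper's code.</p>
<pre><code>
import torch
import torch.nn.functional as F

def consistency_step(model, signal, dirs, target, n_keep=30, w=0.1):
    """One training step: supervise on one random q-space subsampling and
    penalize disagreement with a second subsampling of the same data."""
    n_dirs = signal.shape[1]                   # signal: (batch, n_dirs, ...)
    idx_a = torch.randperm(n_dirs)[:n_keep]    # sampling scheme A
    idx_b = torch.randperm(n_dirs)[:n_keep]    # sampling scheme B
    out_a = model(signal[:, idx_a], dirs[idx_a])
    out_b = model(signal[:, idx_b], dirs[idx_b])
    supervised = F.l1_loss(out_a, target)      # NODDI parameter maps
    consistency = F.mse_loss(out_a, out_b)     # outputs should agree
    return supervised + w * consistency
</code></pre>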
I… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01779v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01779v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01779v1-abstract-full" style="display: none;"> In the era of the Internet of Things (IoT) and data sharing, users frequently upload their personal information to enterprise databases to enjoy enhanced service experiences provided by various online services. However, the widespread presence of system vulnerabilities, remote network intrusions, and insider threats significantly increases the exposure of private enterprise data on the internet. If such data is stolen or leaked by attackers, it can result in severe asset losses and business operation disruptions. To address these challenges, this paper proposes a novel threat detection framework, TabITD. This framework integrates Intrusion Detection Systems (IDS) with User and Entity Behavior Analytics (UEBA) strategies to form a collaborative detection system that bridges the gaps in existing systems' capabilities. It effectively addresses the blurred boundaries between external and insider threats caused by the diversification of attack methods, thereby enhancing the model's learning ability and overall detection performance. Moreover, the proposed method leverages the TabNet architecture, which employs a sparse attention feature selection mechanism that allows TabNet to select the most relevant features at each decision step, thereby improving the detection of rare-class attacks. We evaluated our proposed solution on two different datasets, achieving average accuracies of 96.71% and 97.25%, respectively. The results demonstrate that this approach can effectively detect malicious behaviors such as masquerade attacks and external threats, significantly enhancing network security defenses and the efficiency of network attack detection. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01779v1-abstract-full').style.display = 'none'; document.getElementById('2411.01779v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
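<p class="is-size-7">The TabNet architecture mentioned in this abstract selects features with sparsemax rather than softmax, which drives the attention weights of irrelevant features exactly to zero at each decision step. A minimal NumPy sparsemax, as an illustration of that mechanism rather than the paper's implementation:</p>
<pre><code>
import numpy as np

def sparsemax(z):
    """Sparse alternative to softmax: Euclidean projection of z onto the
    probability simplex; low-relevance entries become exactly zero."""
    z_sorted = np.sort(z)[::-1]
    cssv = np.cumsum(z_sorted)
    k = np.arange(1, len(z) + 1)
    support = 1 + k * z_sorted > cssv          # entries kept in the support
    k_max = k[support][-1]
    tau = (cssv[k_max - 1] - 1) / k_max        # threshold
    return np.maximum(z - tau, 0.0)

# e.g. sparsemax(np.array([1.0, 0.1, 0.0])) keeps only the first feature
</code></pre>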
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00476">arXiv:2411.00476</a> <span> [<a href="https://arxiv.org/pdf/2411.00476">pdf</a>, <a href="https://arxiv.org/format/2411.00476">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> PlanScope: Learning to Plan Within Decision Scope Does Matter </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xin%2C+R">Ren Xin</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jie Cheng</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jun Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00476v1-abstract-short" style="display: inline;"> In the context of autonomous driving, learning-based methods have been promising for the development of planning modules. During the training process of planning modules, directly minimizing the discrepancy between expert-driving logs and planning output is widely deployed. In general, driving logs consist of suddenly appearing obstacles or swiftly changing traffic signals, which typically necessi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00476v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00476v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00476v1-abstract-full" style="display: none;"> In the context of autonomous driving, learning-based methods have been promising for the development of planning modules. During the training process of planning modules, directly minimizing the discrepancy between expert-driving logs and planning output is widely deployed. In general, driving logs consist of suddenly appearing obstacles or swiftly changing traffic signals, which typically necessitate swift and nuanced adjustments in driving maneuvers. Concurrently, future trajectories of the vehicles exhibit their long-term decisions, such as adhering to a reference lane or circumventing stationary obstacles. Due to the unpredictable influence of future events in driving logs, reasoning bias could be naturally introduced to learning based planning modules, which leads to a possible degradation of driving performance. To address this issue, we identify the decisions and their corresponding time horizons, and characterize a so-called decision scope by retaining decisions within derivable horizons only, to mitigate the effect of irrational behaviors caused by unpredictable events. This framework employs wavelet transformation based log preprocessing with an effective loss computation approach, rendering the planning model only sensitive to valuable decisions at the current state. Since frequency domain characteristics are extracted in conjunction with time domain features by wavelets, decision information across various frequency bands within the corresponding time horizon can be suitably captured. Furthermore, to achieve valuable decision learning, this framework leverages a transformer based decoder that incrementally generates the detailed profiles of future decisions over multiple steps. 
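<p class="is-size-7">The decision-scope idea in the PlanScope abstract above can be illustrated with a single-level Haar transform: split the logged future trajectory into a low-frequency band (the long-horizon decision) and a high-frequency band (transient reactions), and compute the loss only within a derivable horizon. The band weights below are illustrative, not the paper's.</p>
<pre><code>
import numpy as np

def haar_level(x):
    """One Haar level: low-frequency approximation and high-frequency
    detail at half resolution (len(x) must be even)."""
    even, odd = x[0::2], x[1::2]
    return (even + odd) / np.sqrt(2.0), (even - odd) / np.sqrt(2.0)

def scoped_loss(pred, log, horizon):
    """Compare trajectories only inside the decision scope `horizon`,
    weighting the slow and fast frequency bands differently."""
    pa, pd = haar_level(pred[:horizon])
    la, ld = haar_level(log[:horizon])
    return np.abs(pa - la).mean() + 0.5 * np.abs(pd - ld).mean()
</code></pre>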
Our experiments demonstrate that our proposed method outperforms baselines in terms of driving scores with closed-loop evaluations on the nuPlan dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00476v1-abstract-full').style.display = 'none'; document.getElementById('2411.00476v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23857">arXiv:2410.23857</a> <span> [<a href="https://arxiv.org/pdf/2410.23857">pdf</a>, <a href="https://arxiv.org/format/2410.23857">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> ECDQC: Efficient Compilation for Distributed Quantum Computing with Linear Layout </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+K">Kecheng Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yidong Zhou</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Haochen Luo</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+L">Lingjun Xiong</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuchen Zhu</a>, <a href="/search/cs?searchtype=author&query=Casey%2C+E">Eilis Casey</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jinglei Cheng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S+Y">Samuel Yen-Chi Chen</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Z">Zhiding Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23857v2-abstract-short" style="display: inline;"> In this paper, we propose an efficient compilation method for distributed quantum computing (DQC) using the Linear Nearest Neighbor (LNN) architecture. By exploiting the LNN topology's symmetry, we optimize quantum circuit compilation for High Local Connectivity, Sparse Full Connectivity (HLC-SFC) algorithms like Quantum Approximate Optimization Algorithm (QAOA) and Quantum Fourier Transform (QFT)… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23857v2-abstract-full').style.display = 'inline'; document.getElementById('2410.23857v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23857v2-abstract-full" style="display: none;"> In this paper, we propose an efficient compilation method for distributed quantum computing (DQC) using the Linear Nearest Neighbor (LNN) architecture. By exploiting the LNN topology's symmetry, we optimize quantum circuit compilation for High Local Connectivity, Sparse Full Connectivity (HLC-SFC) algorithms like Quantum Approximate Optimization Algorithm (QAOA) and Quantum Fourier Transform (QFT). We also utilize dangling qubits to minimize non-local interactions and reduce SWAP gates. 
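<p class="is-size-7">Compiling for a linear-nearest-neighbor chain, as in the ECDQC abstract above, largely comes down to inserting SWAPs until interacting qubits become adjacent. A minimal routing sketch under that assumption, not the paper's optimized compiler:</p>
<pre><code>
def route_on_line(gate_qubits, positions):
    """Insert SWAPs so the two logical qubits of a gate become adjacent on
    an LNN chain. `positions` maps logical qubit -> physical site index."""
    a, b = gate_qubits
    ops = []
    while abs(positions[a] - positions[b]) > 1:
        # move `a` one site toward `b` by swapping with its neighbor
        step = 1 if positions[a] < positions[b] else -1
        neighbor = next(q for q, p in positions.items()
                        if p == positions[a] + step)
        ops.append(("SWAP", a, neighbor))
        positions[a], positions[neighbor] = positions[neighbor], positions[a]
    ops.append(("CNOT", a, b))
    return ops
</code></pre>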
Our approach significantly decreases compilation time, gate count, and circuit depth, improving scalability and robustness for large-scale quantum computations. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23857v2-abstract-full').style.display = 'none'; document.getElementById('2410.23857v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20034">arXiv:2410.20034</a> <span> [<a href="https://arxiv.org/pdf/2410.20034">pdf</a>, <a href="https://arxiv.org/format/2410.20034">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> Sensor2Text: Enabling Natural Language Interactions for Daily Activity Tracking Using Wearable Sensors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wenqiang Chen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jiaxuan Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Leyao Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+W">Wei Zhao</a>, <a href="/search/cs?searchtype=author&query=Matusik%2C+W">Wojciech Matusik</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20034v1-abstract-short" style="display: inline;"> Visual Question-Answering, a technology that generates textual responses from an image and natural language question, has progressed significantly. Notably, it can aid in tracking and inquiring about daily activities, crucial in healthcare monitoring, especially for elderly patients or those with memory disabilities. However, video poses privacy concerns and has a limited field of view. This paper… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20034v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20034v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20034v1-abstract-full" style="display: none;"> Visual Question-Answering, a technology that generates textual responses from an image and natural language question, has progressed significantly. Notably, it can aid in tracking and inquiring about daily activities, crucial in healthcare monitoring, especially for elderly patients or those with memory disabilities. However, video poses privacy concerns and has a limited field of view. This paper presents Sensor2Text, a model proficient in tracking daily activities and engaging in conversations using wearable sensors. 
The approach outlined here tackles several challenges, including low information density in wearable sensor data, the insufficiency of single wearable sensors for human activity recognition, and the model's limited capacity for Question-Answering and interactive conversations. To resolve these obstacles, transfer learning and student-teacher networks are utilized to leverage knowledge from visual-language models. Additionally, an encoder-decoder neural network model is devised to jointly process language and sensor data for conversational purposes. Furthermore, Large Language Models are also utilized to enable interactive capabilities. The model showcases the ability to identify human activities and engage in Q&A dialogues using various wearable sensor modalities. It performs comparably to or better than existing visual-language models in both captioning and conversational tasks. To our knowledge, this represents the first model capable of conversing about wearable sensor data, offering an innovative approach to daily activity tracking that addresses privacy and field-of-view limitations associated with current vision-based solutions. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20034v1-abstract-full').style.display = 'none'; document.getElementById('2410.20034v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18537">arXiv:2410.18537</a> <span> [<a href="https://arxiv.org/pdf/2410.18537">pdf</a>, <a href="https://arxiv.org/format/2410.18537">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Beyond Color and Lines: Zero-Shot Style-Specific Image Variations with Coordinated Semantics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+J">Jinghao Hu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yuhe Zhang</a>, <a href="/search/cs?searchtype=author&query=Geng%2C+G">GuoHua Geng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+L">Liuyuxin Yang</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+J">JiaRui Yan</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jingtao Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">YaDong Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+K">Kang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18537v1-abstract-short" style="display: inline;"> Traditionally, style has been primarily considered in terms of artistic elements such as colors, brushstrokes, and lighting. However, identical semantic subjects, like people, boats, and houses, can vary significantly across different artistic traditions, indicating that style also encompasses the underlying semantics.
Therefore, in this study, we propose a zero-shot scheme for image variation wit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18537v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18537v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18537v1-abstract-full" style="display: none;"> Traditionally, style has been primarily considered in terms of artistic elements such as colors, brushstrokes, and lighting. However, identical semantic subjects, like people, boats, and houses, can vary significantly across different artistic traditions, indicating that style also encompasses the underlying semantics. Therefore, in this study, we propose a zero-shot scheme for image variation with coordinated semantics. Specifically, our scheme transforms the image-to-image problem into an image-to-text-to-image problem. The image-to-text operation employs vision-language models (e.g., BLIP) to generate text describing the content of the input image, including the objects and their positions. Subsequently, the input style keyword is elaborated into a detailed description of this style and then merged with the content text using the reasoning capabilities of ChatGPT. Finally, the text-to-image operation utilizes a Diffusion model to generate images based on the text prompt. To enable the Diffusion model to accommodate more styles, we propose a fine-tuning strategy that injects text and style constraints into cross-attention. This ensures that the output image exhibits similar semantics in the desired style. To validate the performance of the proposed scheme, we constructed a benchmark comprising images of various styles and scenes and introduced two novel metrics. Despite its simplicity, our scheme yields highly plausible results in a zero-shot manner, particularly for generating stylized images with high-fidelity semantics. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18537v1-abstract-full').style.display = 'none'; document.getElementById('2410.18537v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages,6 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18487">arXiv:2410.18487</a> <span> [<a href="https://arxiv.org/pdf/2410.18487">pdf</a>, <a href="https://arxiv.org/format/2410.18487">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Graph Pre-Training Models Are Strong Anomaly Detectors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jiashun Cheng</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+Z">Zinan Zheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J">Jianheng Tang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hongwei Wang</a>, <a href="/search/cs?searchtype=author&query=Rong%2C+Y">Yu Rong</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jia Li</a>, <a href="/search/cs?searchtype=author&query=Tsung%2C+F">Fugee Tsung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18487v1-abstract-short" style="display: inline;"> Graph Anomaly Detection (GAD) is a challenging and practical research topic where Graph Neural Networks (GNNs) have recently shown promising results. The effectiveness of existing GNNs in GAD has been mainly attributed to the simultaneous learning of node representations and the classifier in an end-to-end manner. Meanwhile, graph pre-training, the two-stage learning paradigm such as DGI and Graph… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18487v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18487v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18487v1-abstract-full" style="display: none;"> Graph Anomaly Detection (GAD) is a challenging and practical research topic where Graph Neural Networks (GNNs) have recently shown promising results. The effectiveness of existing GNNs in GAD has been mainly attributed to the simultaneous learning of node representations and the classifier in an end-to-end manner. Meanwhile, graph pre-training, the two-stage learning paradigm such as DGI and GraphMAE, has shown potential in leveraging unlabeled graph data to enhance downstream tasks, yet its impact on GAD remains under-explored. In this work, we show that graph pre-training models are strong graph anomaly detectors. Specifically, we demonstrate that pre-training is highly competitive, markedly outperforming the state-of-the-art end-to-end training models when faced with limited supervision. To understand this phenomenon, we further uncover pre-training enhances the detection of distant, under-represented, unlabeled anomalies that go beyond 2-hop neighborhoods of known anomalies, shedding light on its superior performance against end-to-end models. 
Moreover, we extend our examination to the potential of pre-training in graph-level anomaly detection. We envision this work to stimulate a re-evaluation of pre-training's role in GAD and offer valuable insights for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18487v1-abstract-full').style.display = 'none'; document.getElementById('2410.18487v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17521">arXiv:2410.17521</a> <span> [<a href="https://arxiv.org/pdf/2410.17521">pdf</a>, <a href="https://arxiv.org/format/2410.17521">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Diffusion Priors for Variational Likelihood Estimation and Image Denoising </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jun Cheng</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+S">Shan Tan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17521v1-abstract-short" style="display: inline;"> Real-world noise removal is crucial in low-level computer vision. Due to the remarkable generation capabilities of diffusion models, recent attention has shifted towards leveraging diffusion priors for image restoration tasks. However, existing diffusion priors-based methods either consider simple noise types or rely on approximate posterior estimation, limiting their effectiveness in addressing s… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17521v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17521v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17521v1-abstract-full" style="display: none;"> Real-world noise removal is crucial in low-level computer vision. Due to the remarkable generation capabilities of diffusion models, recent attention has shifted towards leveraging diffusion priors for image restoration tasks. However, existing diffusion priors-based methods either consider simple noise types or rely on approximate posterior estimation, limiting their effectiveness in addressing structured and signal-dependent noise commonly found in real-world images. In this paper, we build upon diffusion priors and propose adaptive likelihood estimation and MAP inference during the reverse diffusion process to tackle real-world noise. We introduce an independent, non-identically distributed likelihood combined with the noise precision (inverse variance) prior and dynamically infer the precision posterior using variational Bayes during the generation process. Meanwhile, we rectify the estimated noise variance through local Gaussian convolution. The final denoised image is obtained by propagating intermediate MAP solutions that balance the updated likelihood and diffusion prior. 
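<p class="is-size-7">With a conjugate Gamma prior on the per-pixel noise precision, the variational update this abstract alludes to has a closed form, and the estimated variance can then be rectified by a local Gaussian convolution. A minimal sketch under those assumptions (hyperparameters are illustrative, not the paper's):</p>
<pre><code>
import numpy as np
from scipy.ndimage import gaussian_filter

def precision_posterior_mean(residual, alpha0=2.0, beta0=1.0):
    """Conjugate Gamma update for per-pixel noise precision tau, given the
    residual between the noisy image and the current denoised estimate."""
    alpha = alpha0 + 0.5
    beta = beta0 + 0.5 * residual ** 2
    return alpha / beta                        # E[tau | residual]

def rectified_variance(residual, sigma=3.0):
    """Variance estimate smoothed by a local Gaussian convolution."""
    return gaussian_filter(1.0 / precision_posterior_mean(residual), sigma)
</code></pre>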
Additionally, we explore the local diffusion prior inherent in low-resolution diffusion models, enabling direct handling of high-resolution noisy images. Extensive experiments and analyses on diverse real-world datasets demonstrate the effectiveness of our method. Code is available at https://github.com/HUST-Tan/DiffusionVI. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17521v1-abstract-full').style.display = 'none'; document.getElementById('2410.17521v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS2024 as Spotlight</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16687">arXiv:2410.16687</a> <span> [<a href="https://arxiv.org/pdf/2410.16687">pdf</a>, <a href="https://arxiv.org/format/2410.16687">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> DARE: Diffusion Policy for Autonomous Robot Exploration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+Y">Yuhong Cao</a>, <a href="/search/cs?searchtype=author&query=Lew%2C+J">Jeric Lew</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jingsong Liang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jin Cheng</a>, <a href="/search/cs?searchtype=author&query=Sartoretti%2C+G">Guillaume Sartoretti</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16687v1-abstract-short" style="display: inline;"> Autonomous robot exploration requires a robot to efficiently explore and map unknown environments. Compared to conventional methods that can only optimize paths based on the current robot belief, learning-based methods show the potential to achieve improved performance by drawing on past experiences to reason about unknown areas. In this paper, we propose DARE, a novel generative approach that lev… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16687v1-abstract-full').style.display = 'inline'; document.getElementById('2410.16687v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16687v1-abstract-full" style="display: none;"> Autonomous robot exploration requires a robot to efficiently explore and map unknown environments. Compared to conventional methods that can only optimize paths based on the current robot belief, learning-based methods show the potential to achieve improved performance by drawing on past experiences to reason about unknown areas. In this paper, we propose DARE, a novel generative approach that leverages diffusion models trained on expert demonstrations, which can explicitly generate an exploration path through one-time inference. 
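<p class="is-size-7">One-time-inference path generation with a diffusion policy, as in the DARE abstract above, looks like a standard DDPM reverse loop over waypoints conditioned on the robot's belief. <code>eps_model</code> is an assumed noise-prediction network, and the noise schedule below is a generic choice rather than the paper's.</p>
<pre><code>
import torch

@torch.no_grad()
def generate_path(eps_model, belief, n_way=16, T=50):
    """Denoise Gaussian noise into a sequence of 2-D waypoints,
    conditioned on the robot's partial map belief."""
    betas = torch.linspace(1e-4, 0.02, T)
    alphas = 1.0 - betas
    abar = torch.cumprod(alphas, dim=0)
    x = torch.randn(1, n_way, 2)                     # (batch, waypoints, xy)
    for t in reversed(range(T)):
        eps = eps_model(x, belief, torch.tensor([t]))
        coef = betas[t] / torch.sqrt(1.0 - abar[t])
        x = (x - coef * eps) / torch.sqrt(alphas[t])  # posterior mean step
        if t > 0:
            x = x + torch.sqrt(betas[t]) * torch.randn_like(x)
    return x.squeeze(0)
</code></pre>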
We build DARE upon an attention-based encoder and a diffusion policy model, and introduce ground truth optimal demonstrations for training to learn better patterns for exploration. The trained planner can reason about the partial belief to recognize the potential structure in unknown areas and consider these areas during path planning. Our experiments demonstrate that DARE achieves on-par performance with both conventional and learning-based state-of-the-art exploration planners, as well as good generalizability in both simulations and real-life scenarios. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16687v1-abstract-full').style.display = 'none'; document.getElementById('2410.16687v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.13409">arXiv:2410.13409</a> <span> [<a href="https://arxiv.org/pdf/2410.13409">pdf</a>, <a href="https://arxiv.org/format/2410.13409">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Attr-Int: A Simple and Effective Entity Alignment Framework for Heterogeneous Knowledge Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+L">Linyan Yang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jingwei Cheng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chuanhao Xu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xihao Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiayi Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fu Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.13409v2-abstract-short" style="display: inline;"> Entity alignment (EA) refers to the task of linking entities in different knowledge graphs (KGs). Existing EA methods rely heavily on structural isomorphism. However, in real-world KGs, aligned entities usually have non-isomorphic neighborhood structures, which paralyses the application of these structure-dependent methods. In this paper, we investigate and tackle the problem of entity alignment b… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13409v2-abstract-full').style.display = 'inline'; document.getElementById('2410.13409v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.13409v2-abstract-full" style="display: none;"> Entity alignment (EA) refers to the task of linking entities in different knowledge graphs (KGs). Existing EA methods rely heavily on structural isomorphism. However, in real-world KGs, aligned entities usually have non-isomorphic neighborhood structures, which paralyses the application of these structure-dependent methods. 
In this paper, we investigate and tackle the problem of entity alignment between heterogeneous KGs. First, we propose two new benchmarks to closely simulate real-world EA scenarios of heterogeneity. Then we conduct extensive experiments to evaluate the performance of representative EA methods on the new benchmarks. Finally, we propose a simple and effective entity alignment framework called Attr-Int, in which innovative attribute information interaction methods can be seamlessly integrated with any embedding encoder for entity alignment, improving the performance of existing entity alignment techniques. Experiments demonstrate that our framework outperforms the state-of-the-art approaches on two new benchmarks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.13409v2-abstract-full').style.display = 'none'; document.getElementById('2410.13409v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10998">arXiv:2410.10998</a> <span> [<a href="https://arxiv.org/pdf/2410.10998">pdf</a>, <a href="https://arxiv.org/format/2410.10998">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> WILT: A Multi-Turn, Memorization-Robust Inductive Logic Benchmark for LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Banatt%2C+E">Eryk Banatt</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jonathan Cheng</a>, <a href="/search/cs?searchtype=author&query=Vaidyanath%2C+S">Skanda Vaidyanath</a>, <a href="/search/cs?searchtype=author&query=Hwu%2C+T">Tiffany Hwu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10998v1-abstract-short" style="display: inline;"> While large language models have shown impressive capabilities across a wide range of domains, they still encounter significant challenges in reasoning tasks that require gathering evidence over multiple turns and drawing logical conclusions. These challenges present significant obstacles for LLM chat user interfaces, which rely on multi-turn interactions to facilitate effective collaboration. Thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10998v1-abstract-full').style.display = 'inline'; document.getElementById('2410.10998v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10998v1-abstract-full" style="display: none;"> While large language models have shown impressive capabilities across a wide range of domains, they still encounter significant challenges in reasoning tasks that require gathering evidence over multiple turns and drawing logical conclusions. 
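<p class="is-size-7">The protocol of the WILT benchmark introduced in this result is easy to state in code: a hidden boolean rule over triples, a budget of proposed test cases with observed outcomes, and a final hypothesis checked against the rule. A minimal harness, with <code>model</code> as an assumed interface:</p>
<pre><code>
def run_episode(model, hidden_rule, max_turns=30):
    """Multi-turn loop: the model proposes test cases, observes boolean
    outcomes, then commits to a final hypothesis."""
    history = []
    for _ in range(max_turns):
        triple = model.propose(history)          # e.g., (2, 4, 6)
        history.append((triple, hidden_rule(*triple)))
    guess = model.final_hypothesis(history)      # a callable over triples
    probe = [(x, y, z) for x in range(5) for y in range(5) for z in range(5)]
    return all(guess(*t) == hidden_rule(*t) for t in probe)

# Example hidden rule from the abstract: x < y < z
episode_ok = lambda m: run_episode(m, lambda x, y, z: x < y < z)
</code></pre>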
These challenges present significant obstacles for LLM chat user interfaces, which rely on multi-turn interactions to facilitate effective collaboration. This limitation leads to real-world issues; for example, service chatbots must gather necessary information from customers over multiple turns to diagnose and resolve problems effectively. Despite the multi-turn nature of many real-world LLM use cases, most existing benchmarks rely on carefully curated single-turn tests, which often blur the line between memorization and genuine reasoning. To address this, we introduce the Wason Inductive Logic Test (WILT), a simple yet challenging multi-turn reasoning benchmark designed to resist memorization. WILT is inspired by the Wason 2-4-6 task, where participants must infer a boolean function involving three variables (e.g., $x < y < z$) by proposing test cases (such as $(2, 4, 6)$). In WILT, each test starts from a clean slate, with only the initial instructions provided, preventing models from relying on pre-learned responses. Over several turns, models must interact with the environment by suggesting test cases to narrow the possible hypotheses and ultimately infer the hidden function based on the outcomes. Our findings reveal that LLMs struggle with this task, exhibiting distinct strengths and weaknesses: some are better at narrowing down the hypothesis space by proposing valuable test cases, while others are more adept at deducing the hidden function from observed cases. Despite these variations, the best-performing model achieves only 28% accuracy, highlighting a significant gap in LLM performance on complex multi-turn reasoning tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10998v1-abstract-full').style.display = 'none'; document.getElementById('2410.10998v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submitted to ICLR 2025. 
Preprint version 1</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10456">arXiv:2410.10456</a> <span> </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Ada-K Routing: Boosting the Efficiency of MoE-based LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yue%2C+T">Tongtian Yue</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+L">Longteng Guo</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jie Cheng</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+X">Xuange Gao</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jing Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.10456v2-abstract-short" style="display: inline;"> In the era of Large Language Models (LLMs), Mixture-of-Experts (MoE) architectures offer a promising approach to managing computational costs while scaling up model parameters. Conventional MoE-based LLMs typically employ static Top-K routing, which activates a fixed and equal number of experts for each token regardless of their significance within the context. In this paper, we propose a novel Ad… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10456v2-abstract-full').style.display = 'inline'; document.getElementById('2410.10456v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.10456v2-abstract-full" style="display: none;"> In the era of Large Language Models (LLMs), Mixture-of-Experts (MoE) architectures offer a promising approach to managing computational costs while scaling up model parameters. Conventional MoE-based LLMs typically employ static Top-K routing, which activates a fixed and equal number of experts for each token regardless of their significance within the context. In this paper, we propose a novel Ada-K routing strategy that dynamically adjusts the number of activated experts for each token, thereby improving the balance between computational efficiency and model performance. Specifically, our strategy incorporates learnable and lightweight allocator modules that decide customized expert resource allocation tailored to the contextual needs for each token. These allocators are designed to be fully pluggable, making it broadly applicable across all mainstream MoE-based LLMs. We leverage the Proximal Policy Optimization (PPO) algorithm to facilitate an end-to-end learning process for this non-differentiable decision-making framework. Extensive evaluations on four popular baseline models demonstrate that our Ada-K routing method significantly outperforms conventional Top-K routing. Compared to Top-K, our method achieves over 25% reduction in FLOPs and more than 20% inference speedup while still improving performance across various benchmarks. Moreover, the training of Ada-K is highly efficient. Even for Mixtral-8x22B, a MoE-based LLM with more than 140B parameters, the training time is limited to 8 hours. Detailed analysis shows that harder tasks, middle layers, and content words tend to activate more experts, providing valuable insights for future adaptive MoE system designs. 
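<p class="is-size-7">Ada-K's inference-time behaviour, an allocator choosing how many experts each token activates, can be sketched as below; training the allocator uses PPO, which the sketch omits, and all tensor names are illustrative rather than the paper's.</p>
<pre><code>
import torch

def ada_k_route(router_logits, allocator_logits, k_max=8):
    """Per-token adaptive top-k: the allocator picks how many experts each
    token activates; routing keeps that many highest-scoring experts."""
    # allocator_logits: (tokens, k_max), a categorical over k = 1..k_max
    k = torch.argmax(allocator_logits, dim=-1) + 1          # (tokens,)
    weights = torch.softmax(router_logits, dim=-1)          # (tokens, experts)
    order = torch.argsort(weights, dim=-1, descending=True)
    ranks = torch.argsort(order, dim=-1)                    # rank of each expert
    mask = ranks < k.unsqueeze(-1)                          # keep top-k(token)
    routed = weights * mask
    return routed / routed.sum(dim=-1, keepdim=True)        # renormalize
</code></pre>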
Both the training code and model checkpoints will be publicly available. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.10456v2-abstract-full').style.display = 'none'; document.getElementById('2410.10456v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Coauthors do not reach a consensus on submitting the current version</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09492">arXiv:2410.09492</a> <span> [<a href="https://arxiv.org/pdf/2410.09492">pdf</a>, <a href="https://arxiv.org/format/2410.09492">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> A Simple yet Effective Subway Self-positioning Method based on Aerial-view Sleeper Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+J">Jiajie Song</a>, <a href="/search/cs?searchtype=author&query=Song%2C+N">Ningfang Song</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+X">Xiong Pan</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoxin Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C">Can Chen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jingchun Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09492v1-abstract-short" style="display: inline;"> With the rapid development of urban underground rail vehicles, subway positioning, which plays a fundamental role in the traffic navigation and collision avoidance systems, has become a research hot-spot in recent years. Most current subway positioning methods rely on localization beacons densely pre-installed alongside the railway tracks, requiring massive costs for infrastructure and maintenance, whi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09492v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09492v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09492v1-abstract-full" style="display: none;"> With the rapid development of urban underground rail vehicles, subway positioning, which plays a fundamental role in the traffic navigation and collision avoidance systems, has become a research hot-spot in recent years. Most current subway positioning methods rely on localization beacons densely pre-installed alongside the railway tracks, requiring massive costs for infrastructure and maintenance, while commonly lacking flexibility and anti-interference ability. In this paper, we propose a low-cost and real-time visual-assisted self-localization framework to address the robust and convenient positioning problem for subways.
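<p class="is-size-7">The core correction step this framework implies, pinning dead-reckoned mileage to counted sleeper crossings at a roughly fixed spacing, fits in a few lines; the spacing and blend weight here are assumptions for illustration, not the paper's calibrated values.</p>
<pre><code>
def corrected_mileage(raw_mileage, sleeper_crossings, spacing_m=0.6):
    """Fuse dead-reckoned mileage with the sleeper count: each detected
    sleeper crossing pins the travelled distance to a multiple of the
    (assumed) sleeper spacing, correcting odometry drift."""
    snapped = sleeper_crossings * spacing_m
    return 0.5 * raw_mileage + 0.5 * snapped   # illustrative 50/50 blend
</code></pre>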
Firstly, we perform aerial-view rail sleeper detection based on the fast and efficient YOLOv8n network. The detection results are then used to achieve real-time correction of mileage values combined with geometric positioning information, obtaining precise subway locations. Front-camera videos of subway driving scenes along a 6.9 km route are collected from the simulator and annotated for validation of the proposed method. Experimental results show that our aerial-view sleeper detection algorithm can efficiently detect sleeper positions with an F1-score of 0.929 at 1111 fps, and that the proposed positioning framework achieves a mean percentage error of 0.1%, demonstrating its continuous and high-precision self-localization capability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09492v1-abstract-full').style.display = 'none'; document.getElementById('2410.09492v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 8 figures, under review for IEEE Sensors Journal publication</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.09407">arXiv:2410.09407</a> <span> [<a href="https://arxiv.org/pdf/2410.09407">pdf</a>, <a href="https://arxiv.org/format/2410.09407">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> CAMPHOR: Collaborative Agents for Multi-input Planning and High-Order Reasoning On Device </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yicheng Fu</a>, <a href="/search/cs?searchtype=author&query=Anantha%2C+R">Raviteja Anantha</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jianpeng Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.09407v1-abstract-short" style="display: inline;"> While server-side Large Language Models (LLMs) demonstrate proficiency in function calling and complex reasoning, deploying Small Language Models (SLMs) directly on devices brings opportunities to improve latency and privacy but also introduces unique challenges for accuracy and memory.
We introduce CAMPHOR, an innovative on-device SLM multi-agent framework designed to handle multiple user inputs… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09407v1-abstract-full').style.display = 'inline'; document.getElementById('2410.09407v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.09407v1-abstract-full" style="display: none;"> While server-side Large Language Models (LLMs) demonstrate proficiency in function calling and complex reasoning, deploying Small Language Models (SLMs) directly on devices brings opportunities to improve latency and privacy but also introduces unique challenges for accuracy and memory. We introduce CAMPHOR, an innovative on-device SLM multi-agent framework designed to handle multiple user inputs and reason over personal context locally, ensuring privacy is maintained. CAMPHOR employs a hierarchical architecture where a high-order reasoning agent decomposes complex tasks and coordinates expert agents responsible for personal context retrieval, tool interaction, and dynamic plan generation. By implementing parameter sharing across agents and leveraging prompt compression, we significantly reduce model size, latency, and memory usage. To validate our approach, we present a novel dataset capturing multi-agent task trajectories centered on personalized mobile assistant use-cases. Our experiments reveal that fine-tuned SLM agents not only surpass closed-source LLMs in task completion F1 by ~35% but also eliminate the need for server-device communication, all while enhancing privacy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.09407v1-abstract-full').style.display = 'none'; document.getElementById('2410.09407v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
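<p class="is-size-7">The hierarchical pattern this abstract describes, a high-order reasoning agent decomposing a request and dispatching sub-tasks to expert agents, reduces to a small control loop. Every name below is an illustrative stand-in, not the CAMPHOR API.</p>
<pre><code>
def hierarchical_plan(request, planner, experts):
    """High-order agent decomposes the request; expert agents (context
    retrieval, tool use, plan generation) handle each sub-task in turn."""
    steps = planner.decompose(request)            # e.g., ["retrieve context", ...]
    results = []
    for step in steps:
        agent = experts[planner.pick_expert(step)]
        results.append(agent.run(step, results))  # later steps see earlier ones
    return planner.summarize(request, results)
</code></pre>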
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06722">arXiv:2410.06722</a> <span> [<a href="https://arxiv.org/pdf/2410.06722">pdf</a>, <a href="https://arxiv.org/format/2410.06722">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Scaling Laws for Mixed quantization in Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cao%2C+Z">Zeyu Cao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Cheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Gimenes%2C+P">Pedro Gimenes</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+J">Jianqiao Lu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jianyi Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yiren Zhao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.06722v1-abstract-full"> Post-training quantization of Large Language Models (LLMs) has proven effective in reducing the computational requirements for running inference on these models. In this study, we focus on a straightforward question: when aiming for a specific accuracy or perplexity target with low-precision quantization, how many high-precision numbers or calculations must be preserved as we scale LLMs to larger sizes? We first introduce a critical metric named the quantization ratio, which compares the number of parameters quantized to low-precision arithmetic against the total parameter count. Through extensive and carefully controlled experiments across different model families, arithmetic types, and quantization granularities (e.g., layer-wise, matmul-wise), we identify two central phenomena. 1) The larger the model, the better it preserves performance at an increased quantization ratio, as measured by perplexity in pre-training tasks or accuracy in downstream tasks. 2) The finer the granularity of mixed-precision quantization (e.g., matmul-wise), the more the model can increase the quantization ratio. We believe these observations offer valuable insights for future AI hardware design and the development of advanced efficient-AI algorithms. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
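<p>The quantization ratio is defined directly in the abstract, so it reduces to a one-line computation. A minimal sketch (layer names and shapes are made-up, LLaMA-7B-like values for illustration):</p>
<pre><code class="language-python">
def quantization_ratio(param_counts: dict[str, int],
                       low_precision: set[str]) -> float:
    """Parameters quantized to low precision over total parameters."""
    total = sum(param_counts.values())
    quantized = sum(n for name, n in param_counts.items()
                    if name in low_precision)
    return quantized / total

params = {"attn.qkv": 3 * 4096 * 4096, "attn.out": 4096 * 4096,
          "mlp.up": 4096 * 11008, "mlp.down": 11008 * 4096}
print(quantization_ratio(params, {"mlp.up", "mlp.down"}))  # ~0.57
</code></pre>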
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06722v1-abstract-full').style.display = 'none'; document.getElementById('2410.06722v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.06527">arXiv:2410.06527</a> <span> [<a href="https://arxiv.org/pdf/2410.06527">pdf</a>, <a href="https://arxiv.org/format/2410.06527">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> The Sampling-Gaussian for stereo matching </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Pan%2C+B">Baiyu Pan</a>, <a href="/search/cs?searchtype=author&query=jiao%2C+j">jichao jiao</a>, <a href="/search/cs?searchtype=author&query=Yao%2C+B">Bowen Yao</a>, <a href="/search/cs?searchtype=author&query=Pang%2C+J">Jianxin Pang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jun Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.06527v1-abstract-short" style="display: inline;"> The soft-argmax operation is widely adopted in neural network-based stereo matching methods to enable differentiable regression of disparity. However, network trained with soft-argmax is prone to being multimodal due to absence of explicit constraint to the shape of the probability distribution. Previous methods leverages Laplacian distribution and cross-entropy for training but failed to effectiv… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.06527v1-abstract-full').style.display = 'inline'; document.getElementById('2410.06527v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.06527v1-abstract-full" style="display: none;"> The soft-argmax operation is widely adopted in neural network-based stereo matching methods to enable differentiable regression of disparity. However, network trained with soft-argmax is prone to being multimodal due to absence of explicit constraint to the shape of the probability distribution. Previous methods leverages Laplacian distribution and cross-entropy for training but failed to effectively improve the accuracy and even compromises the efficiency of the network. In this paper, we conduct a detailed analysis of the previous distribution-based methods and propose a novel supervision method for stereo matching, Sampling-Gaussian. We sample from the Gaussian distribution for supervision. Moreover, we interpret the training as minimizing the distance in vector space and propose a combined loss of L1 loss and cosine similarity loss. Additionally, we leveraged bilinear interpolation to upsample the cost volume. Our method can be directly applied to any soft-argmax-based stereo matching method without a reduction in efficiency. 
We have conducted comprehensive experiments demonstrating the superior performance of Sampling-Gaussian: it achieves better accuracy with five baseline methods on two datasets. Our method is easy to implement, and the code is available online. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">TL;DR: A novel Gaussian distribution-based supervision method for stereo matching, implemented with five baseline methods and achieving notable improvement. Main content, 10 pages; conference submission</span> </p>
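<p>A hedged sketch of the supervision idea summarized above: build a target distribution by sampling a Gaussian centered on the ground-truth disparity at each candidate, then combine an L1 term with a cosine-similarity term. The hyper-parameters (sigma, lam) are illustrative assumptions, not the paper's values:</p>
<pre><code class="language-python">
import torch
import torch.nn.functional as F

def sampling_gaussian_loss(prob, disparity_gt, sigma=1.0, lam=1.0):
    # prob: (B, D, H, W) softmax output over D disparity candidates
    # disparity_gt: (B, H, W) ground-truth disparity
    B, D, H, W = prob.shape
    d = torch.arange(D, device=prob.device).view(1, D, 1, 1).float()
    target = torch.exp(-(d - disparity_gt.unsqueeze(1)) ** 2 / (2 * sigma ** 2))
    target = target / target.sum(dim=1, keepdim=True)  # normalize per pixel
    l1 = (prob - target).abs().mean()
    cos = 1 - F.cosine_similarity(prob.flatten(2).transpose(1, 2),
                                  target.flatten(2).transpose(1, 2),
                                  dim=-1).mean()
    return l1 + lam * cos
</code></pre>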
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.03907">arXiv:2410.03907</a> <span> [<a href="https://arxiv.org/pdf/2410.03907">pdf</a>, <a href="https://arxiv.org/format/2410.03907">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> ActPlan-1K: Benchmarking the Procedural Planning Ability of Visual Language Models in Household Activities </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Su%2C+Y">Ying Su</a>, <a href="/search/cs?searchtype=author&query=Ling%2C+Z">Zhan Ling</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+H">Haochen Shi</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jiayang Cheng</a>, <a href="/search/cs?searchtype=author&query=Yim%2C+Y">Yauwai Yim</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Y">Yangqiu Song</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.03907v1-abstract-full"> Large language models (LLMs) have been adopted to process textual task descriptions and accomplish procedural planning in embodied AI tasks because of their powerful reasoning ability. However, there is still a lack of study on how vision language models (VLMs) behave when multi-modal task inputs are considered. Counterfactual planning, which evaluates a model's reasoning ability over alternative task situations, is also underexplored. To evaluate planning ability along both the multi-modal and counterfactual aspects, we propose ActPlan-1K, a multi-modal planning benchmark constructed with ChatGPT and the household activity simulator iGibson2. The benchmark consists of 153 activities and 1,187 instances. Each instance, describing one activity, has a natural language task description and multiple environment images from the simulator. The gold plan of each instance is an action sequence over the objects in the provided scenes. Typical VLMs are evaluated on both correctness and commonsense satisfaction. It turns out that current VLMs still struggle to generate human-level procedural plans for both normal and counterfactual activities. We further provide automatic evaluation metrics by fine-tuning the BLEURT model to facilitate future research on our benchmark. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">13 pages, 9 figures, 8 tables, accepted to EMNLP 2024 main conference</span> </p>
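<p>From the description above, one benchmark instance bundles a task description, simulator images, and a gold action sequence. A minimal data sketch (field names are assumptions, not the released schema):</p>
<pre><code class="language-python">
from dataclasses import dataclass

@dataclass
class ActPlanInstance:
    activity: str                  # one of the 153 household activities
    task_description: str          # natural language task description
    environment_images: list[str]  # paths to simulator images
    gold_plan: list[str]           # action sequence over scene objects
</code></pre>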
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.00564">arXiv:2410.00564</a> <span> [<a href="https://arxiv.org/pdf/2410.00564">pdf</a>, <a href="https://arxiv.org/format/2410.00564">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Scaling Offline Model-Based RL via Jointly-Optimized World-Action Model Pretraining </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jie Cheng</a>, <a href="/search/cs?searchtype=author&query=Qiao%2C+R">Ruixi Qiao</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+G">Gang Xiong</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+Q">Qinghai Miao</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Y">Yingwei Ma</a>, <a href="/search/cs?searchtype=author&query=Li%2C+B">Binhua Li</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yongbin Li</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+Y">Yisheng Lv</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.00564v2-abstract-full"> A significant aspiration of offline reinforcement learning (RL) is to develop a generalist agent with high capabilities from large and heterogeneous datasets. However, prior approaches that scale offline RL either rely heavily on expert trajectories or struggle to generalize to diverse unseen tasks. Inspired by the excellent generalization of world models in conditional video generation, we explore the potential of image-observation-based world models for scaling offline RL and enhancing generalization on novel tasks. In this paper, we introduce JOWA: Jointly-Optimized World-Action model, an offline model-based RL agent pretrained on multiple Atari games with 6 billion tokens of data to learn general-purpose representations and decision-making ability. Our method jointly optimizes a world-action model through a shared transformer backbone, which stabilizes temporal-difference learning with large models during pretraining. Moreover, we propose a provably efficient and parallelizable planning algorithm to compensate for Q-value estimation error and thus search out better policies. Experimental results indicate that our largest agent, with 150 million parameters, achieves 78.9% human-level performance on pretrained games using only 10% subsampled offline data, outperforming existing state-of-the-art large-scale offline RL baselines by 31.6% on average. Furthermore, JOWA scales favorably with model capacity and can sample-efficiently transfer to novel games using only 5k offline fine-tuning samples (approximately 4 trajectories) per game, demonstrating superior generalization. We will release code and model weights at https://github.com/CJReinforce/JOWA. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 1 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p>
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17266">arXiv:2409.17266</a> <span> [<a href="https://arxiv.org/pdf/2409.17266">pdf</a>, <a href="https://arxiv.org/format/2409.17266">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Engineering, Finance, and Science">cs.CE</span> </div> </div> <p class="title is-5 mathjax"> AAPM: Large Language Model Agent-based Asset Pricing Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Junyan Cheng</a>, <a href="/search/cs?searchtype=author&query=Chin%2C+P">Peter Chin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.17266v1-abstract-full"> In this study, we propose a novel asset pricing approach, LLM Agent-based Asset Pricing Models (AAPM), which fuses qualitative discretionary investment analysis from LLM agents with quantitative, manually constructed financial-economic factors to predict excess asset returns. The experimental results show that our approach outperforms machine-learning-based asset pricing baselines in portfolio optimization and asset pricing errors. Specifically, the Sharpe ratio and average $|\alpha|$ for anomaly portfolios improved significantly, by 9.6% and 10.8% respectively. In addition, we conducted extensive ablation studies on our model and analysis of the data to reveal further insights into the proposed method. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
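<p>For readers outside finance, the Sharpe ratio cited above is the mean excess return over its standard deviation; a minimal sketch (the annualization factor is a convention for monthly data, not the paper's setting):</p>
<pre><code class="language-python">
import numpy as np

def sharpe_ratio(excess_returns, periods_per_year=12):
    """Annualized mean excess return per unit of return volatility."""
    r = np.asarray(excess_returns, dtype=float)
    return np.sqrt(periods_per_year) * r.mean() / r.std(ddof=1)
</code></pre>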
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.17091">arXiv:2409.17091</a> <span> [<a href="https://arxiv.org/pdf/2409.17091">pdf</a>, <a href="https://arxiv.org/format/2409.17091">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Ctrl-GenAug: Controllable Generative Augmentation for Medical Sequence Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xinrui Zhou</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Y">Yuhao Huang</a>, <a href="/search/cs?searchtype=author&query=Dou%2C+H">Haoran Dou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S">Shijing Chen</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+A">Ao Chang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jia Liu</a>, <a href="/search/cs?searchtype=author&query=Long%2C+W">Weiran Long</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+J">Jian Zheng</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+E">Erjiao Xu</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jie Ren</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+R">Ruobing Huang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jun Cheng</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+W">Wufeng Xue</a>, <a href="/search/cs?searchtype=author&query=Ni%2C+D">Dong Ni</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.17091v1-abstract-full"> In the medical field, the limited availability of large-scale datasets and labor-intensive annotation processes hinder the performance of deep models. Diffusion-based generative augmentation approaches present a promising solution to this issue, having proven effective in advancing downstream medical recognition tasks. Nevertheless, existing works lack sufficient semantic and sequential steerability for challenging video/3D sequence generation, and neglect quality control of noisy synthesized samples, resulting in unreliable synthetic databases that severely limit the performance of downstream tasks.
In this work, we present Ctrl-GenAug, a novel and general generative augmentation framework that enables semantically and sequentially customized sequence synthesis and suppresses incorrectly synthesized samples, to aid medical sequence classification. Specifically, we first design a multimodal condition-guided sequence generator for controllably synthesizing diagnosis-promotive samples. A sequential augmentation module is integrated to enhance the temporal/stereoscopic coherence of generated samples. Then, we propose a noisy synthetic data filter to suppress unreliable cases at the semantic and sequential levels. Extensive experiments on 3 medical datasets, using 11 networks trained under 3 paradigms, comprehensively analyze the effectiveness and generality of Ctrl-GenAug, particularly for underrepresented high-risk populations and out-of-domain conditions. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 25 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">17 pages, 7 figures, 7 tables</span> </p>
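<p>One common way to realize a noisy-synthetic-sample filter of the kind described above is to keep a generated sequence only when a reference scorer is sufficiently confident in its intended label; the threshold and scoring model below are assumptions, not the paper's components:</p>
<pre><code class="language-python">
def filter_synthetic(samples, score_fn, threshold=0.8):
    """samples: iterable of (sequence, intended_label) pairs;
    score_fn(sequence, label) returns a confidence in [0, 1]."""
    return [(seq, label) for seq, label in samples
            if score_fn(seq, label) >= threshold]
</code></pre>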
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.14939">arXiv:2409.14939</a> <span> [<a href="https://arxiv.org/pdf/2409.14939">pdf</a>, <a href="https://arxiv.org/format/2409.14939">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Hardware Architecture">cs.AR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3622781.3674167">10.1145/3622781.3674167 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> FastGL: A GPU-Efficient Framework for Accelerating Sampling-Based GNN Training at Large Scale </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Z">Zeyu Zhu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+P">Peisong Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+Q">Qinghao Hu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+G">Gang Li</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+X">Xiaoyao Liang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jian Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.14939v1-abstract-full"> Graph Neural Networks (GNNs) have shown great superiority on non-Euclidean graph data, achieving ground-breaking performance on various graph-related tasks. As a practical way to train GNNs on large graphs with billions of nodes and edges, sampling-based training is widely adopted by existing training frameworks. However, through an in-depth analysis, we observe that the efficiency of existing sampling-based training frameworks is still limited by key bottlenecks in all three phases of sampling-based training, i.e., subgraph sampling, memory IO, and computation. To this end, we propose FastGL, a GPU-efficient framework for accelerating sampling-based GNN training at large scale by simultaneously optimizing all three phases, taking into account both GPU characteristics and graph structure. Specifically, by exploiting the inherent overlap within graph structures, FastGL develops a Match-Reorder strategy to reduce data traffic, which accelerates memory IO without incurring any GPU memory overhead. Additionally, FastGL leverages a Memory-Aware computation method, harnessing the hierarchical nature of GPU memory to mitigate irregular data access during computation. FastGL further incorporates a Fused-Map approach aimed at diminishing synchronization overhead during sampling. Extensive experiments demonstrate that FastGL achieves average speedups of 11.8x, 2.2x, and 1.5x over the state-of-the-art frameworks PyG, DGL, and GNNLab, respectively. Our code is available at https://github.com/a1bc2def6g/fastgl-ae. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ASPLOS 2024 fall cycle after major revision</span> </p>
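<p>As background for the "subgraph sampling" phase the paper optimizes: generic sampling-based GNN training draws a small, bounded neighborhood per mini-batch. A minimal sketch of uniform neighbor sampling (the baseline technique, not FastGL's optimized implementation):</p>
<pre><code class="language-python">
import random

def sample_neighbors(adj: dict[int, list[int]], seeds: list[int],
                     fanout: int) -> dict[int, list[int]]:
    """For each seed node, keep at most `fanout` random neighbors,
    bounding the subgraph size of every mini-batch."""
    return {v: random.sample(adj[v], min(fanout, len(adj[v])))
            for v in seeds}
</code></pre>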
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ASPLOS 2024 fall cycle after major revision</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.13282">arXiv:2409.13282</a> <span> [<a href="https://arxiv.org/pdf/2409.13282">pdf</a>, <a href="https://arxiv.org/format/2409.13282">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Velocity Field: An Informative Traveling Cost Representation for Trajectory Planning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xin%2C+R">Ren Xin</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jie Cheng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Sheng Wang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+M">Ming Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.13282v1-abstract-short" style="display: inline;"> Trajectory planning involves generating a series of space points to be followed in the near future. However, due to the complex and uncertain nature of the driving environment, it is impractical for autonomous vehicles~(AVs) to exhaustively design planning rules for optimizing future trajectories. To address this issue, we propose a local map representation method called Velocity Field. This appro… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13282v1-abstract-full').style.display = 'inline'; document.getElementById('2409.13282v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.13282v1-abstract-full" style="display: none;"> Trajectory planning involves generating a series of space points to be followed in the near future. However, due to the complex and uncertain nature of the driving environment, it is impractical for autonomous vehicles~(AVs) to exhaustively design planning rules for optimizing future trajectories. To address this issue, we propose a local map representation method called Velocity Field. This approach provides heading and velocity priors for trajectory planning tasks, simplifying the planning process in complex urban driving. The heading and velocity priors can be learned from demonstrations of human drivers using our proposed loss. Additionally, we developed an iterative sampling-based planner to train and compare the differences between local map representations. We investigated local map representation forms for planning performance on a real-world dataset. Compared to learned rasterized cost maps, our method demonstrated greater reliability and computational efficiency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.13282v1-abstract-full').style.display = 'none'; document.getElementById('2409.13282v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">ROBIO 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.11664">arXiv:2409.11664</a> <span> [<a href="https://arxiv.org/pdf/2409.11664">pdf</a>, <a href="https://arxiv.org/format/2409.11664">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3664647.3681425">10.1145/3664647.3681425 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Agent Aggregator with Mask Denoise Mechanism for Histopathology Whole Slide Image Analysis </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Ling%2C+X">Xitong Ling</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+M">Minxi Ouyang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yizhi Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xinrui Chen</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+R">Renao Yan</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+H">Hongbo Chu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Junru Cheng</a>, <a href="/search/cs?searchtype=author&query=Guan%2C+T">Tian Guan</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+S">Sufang Tian</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoping Liu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yonghong He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.11664v1-abstract-short" style="display: inline;"> Histopathology analysis is the gold standard for medical diagnosis. Accurate classification of whole slide images (WSIs) and region-of-interests (ROIs) localization can assist pathologists in diagnosis. The gigapixel resolution of WSI and the absence of fine-grained annotations make direct classification and analysis challenging. In weakly supervised learning, multiple instance learning (MIL) pres… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.11664v1-abstract-full').style.display = 'inline'; document.getElementById('2409.11664v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.11664v1-abstract-full" style="display: none;"> Histopathology analysis is the gold standard for medical diagnosis. Accurate classification of whole slide images (WSIs) and region-of-interests (ROIs) localization can assist pathologists in diagnosis. The gigapixel resolution of WSI and the absence of fine-grained annotations make direct classification and analysis challenging. In weakly supervised learning, multiple instance learning (MIL) presents a promising approach for WSI classification. 
The prevailing strategy is to use attention mechanisms to measure instance importance for classification. However, attention mechanisms fail to capture inter-instance information, and self-attention incurs quadratic computational complexity. To address these challenges, we propose AMD-MIL, an agent aggregator with a mask-denoise mechanism. The agent token acts as an intermediate variable between the query and key for computing instance importance. Mask and denoise matrices, mapped from the agent-aggregated values, dynamically mask low-contribution representations and eliminate noise. AMD-MIL achieves better attention allocation by adjusting feature representations, capturing micro-metastases in cancer, and improving interpretability. Extensive experiments on CAMELYON-16, CAMELYON-17, TCGA-KIDNEY, and TCGA-LUNG show AMD-MIL's superiority over state-of-the-art methods. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
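<p>A minimal sketch of agent-token attention in the spirit described above: the agent tokens act as an intermediate between queries and keys, avoiding the quadratic cost of full self-attention (the mask/denoise matrices are omitted here, and shapes are illustrative):</p>
<pre><code class="language-python">
import torch
import torch.nn.functional as F

def agent_attention(q, k, v, agents):
    # q, k, v: (B, N, d) instance features; agents: (B, M, d), M far below N
    d = q.size(-1)
    # Agents first aggregate the instances...
    agg = F.softmax(agents @ k.transpose(1, 2) / d ** 0.5, dim=-1) @ v
    # ...then broadcast the aggregate back to the original queries.
    out = F.softmax(q @ agents.transpose(1, 2) / d ** 0.5, dim=-1) @ agg
    return out  # (B, N, d), at O(N*M) cost instead of O(N*N)
</code></pre>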
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.08537">arXiv:2409.08537</a> <span> [<a href="https://arxiv.org/pdf/2409.08537">pdf</a>, <a href="https://arxiv.org/format/2409.08537">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SRE-CNN: A Spatiotemporal Rotation-Equivariant CNN for Cardiac Cine MR Imaging </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhu%2C+Y">Yuliang Zhu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jing Cheng</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+Z">Zhuo-Xu Cui</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+J">Jianfeng Ren</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chengbo Wang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+D">Dong Liang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.08537v1-abstract-full"> Dynamic MR images possess various transformation symmetries, including the rotation symmetry of local features within the image and along the temporal dimension. Utilizing these symmetries as prior knowledge can facilitate dynamic MR imaging with high spatiotemporal resolution. Equivariant CNNs are an effective tool for leveraging such symmetry priors. However, current equivariant CNN methods fail to fully exploit these symmetry priors in dynamic MR imaging. In this work, we propose a novel framework, the Spatiotemporal Rotation-Equivariant CNN (SRE-CNN), spanning from the underlying high-precision filter design to the construction of the temporal-equivariant convolutional module and imaging model, to fully harness the rotation symmetries inherent in dynamic MR images. The temporal-equivariant convolutional module enables exploitation of rotation symmetries in both the spatial and temporal dimensions, while the high-precision convolutional filter, based on a parametrization strategy, enhances the use of the rotation symmetry of local features to improve the reconstruction of detailed anatomical structures. Experiments conducted on highly undersampled dynamic cardiac cine data (up to 20x) have demonstrated the superior performance of our proposed approach, both quantitatively and qualitatively. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at MICCAI 2024</span> </p>
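<p>To illustrate the kind of rotation-symmetry prior discussed above, here is a toy C4 (90-degree) lifting convolution; the paper's high-precision parametrized filters handle finer rotations and the temporal dimension, which this sketch does not:</p>
<pre><code class="language-python">
import torch
import torch.nn.functional as F

def c4_lifting_conv(x, weight):
    # x: (B, C_in, H, W); weight: (C_out, C_in, k, k)
    # Convolve with the kernel rotated by 0, 90, 180, and 270 degrees,
    # producing one response per group element.
    outs = [F.conv2d(x, torch.rot90(weight, r, dims=(2, 3)), padding="same")
            for r in range(4)]
    return torch.stack(outs, dim=2)  # (B, C_out, 4, H, W)
</code></pre>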
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted at MICCAI 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04847">arXiv:2409.04847</a> <span> [<a href="https://arxiv.org/pdf/2409.04847">pdf</a>, <a href="https://arxiv.org/format/2409.04847">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Rethinking The Training And Evaluation of Rich-Context Layout-to-Image Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jiaxin Cheng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zixu Zhao</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tong He</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tianjun Xiao</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yicong Zhou</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zheng Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04847v1-abstract-short" style="display: inline;"> Recent advancements in generative models have significantly enhanced their capacity for image generation, enabling a wide range of applications such as image editing, completion and video editing. A specialized area within generative modeling is layout-to-image (L2I) generation, where predefined layouts of objects guide the generative process. In this study, we introduce a novel regional cross-att… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04847v1-abstract-full').style.display = 'inline'; document.getElementById('2409.04847v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04847v1-abstract-full" style="display: none;"> Recent advancements in generative models have significantly enhanced their capacity for image generation, enabling a wide range of applications such as image editing, completion and video editing. A specialized area within generative modeling is layout-to-image (L2I) generation, where predefined layouts of objects guide the generative process. In this study, we introduce a novel regional cross-attention module tailored to enrich layout-to-image generation. This module notably improves the representation of layout regions, particularly in scenarios where existing methods struggle with highly complex and detailed textual descriptions. Moreover, while current open-vocabulary L2I methods are trained in an open-set setting, their evaluations often occur in closed-set environments. To bridge this gap, we propose two metrics to assess L2I performance in open-vocabulary scenarios. Additionally, we conduct a comprehensive user study to validate the consistency of these metrics with human preferences. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04847v1-abstract-full').style.display = 'none'; document.getElementById('2409.04847v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04214">arXiv:2409.04214</a> <span> [<a href="https://arxiv.org/pdf/2409.04214">pdf</a>, <a href="https://arxiv.org/format/2409.04214">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Diagram Formalization Enhanced Multi-Modal Geometry Problem Solver </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zeren Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jo-Ku Cheng</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+J">Jingyang Deng</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+L">Lu Tian</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+J">Jinwen Ma</a>, <a href="/search/cs?searchtype=author&query=Qin%2C+Z">Ziran Qin</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xiaokai Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+N">Na Zhu</a>, <a href="/search/cs?searchtype=author&query=Leng%2C+T">Tuo Leng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.04214v2-abstract-short" style="display: inline;"> Mathematical reasoning remains an ongoing challenge for AI models, especially for geometry problems that require both linguistic and visual signals. As the vision encoders of most MLLMs are trained on natural scenes, they often struggle to understand geometric diagrams, performing no better in geometry problem solving than LLMs that only process text. This limitation is amplified by the lack of ef… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.04214v2-abstract-full').style.display = 'inline'; document.getElementById('2409.04214v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.04214v2-abstract-full" style="display: none;"> Mathematical reasoning remains an ongoing challenge for AI models, especially for geometry problems that require both linguistic and visual signals. As the vision encoders of most MLLMs are trained on natural scenes, they often struggle to understand geometric diagrams, performing no better in geometry problem solving than LLMs that only process text. This limitation is amplified by the lack of effective methods for representing geometric relationships. To address these issues, we introduce the Diagram Formalization Enhanced Geometry Problem Solver (DFE-GPS), a new framework that integrates visual features, geometric formal language, and natural language representations. 
We propose a novel synthetic data approach and create a large-scale geometric dataset, SynthGeo228K, annotated with both formal and natural language captions, designed to enhance the vision encoder's understanding of geometric structures. Our framework improves MLLMs' ability to process geometric diagrams and extends their application to open-ended tasks on the formalgeo7k dataset. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.04081">arXiv:2409.04081</a> <span> [<a href="https://arxiv.org/pdf/2409.04081">pdf</a>, <a href="https://arxiv.org/format/2409.04081">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> UI-JEPA: Towards Active Perception of User Intent through Onscreen User Activity </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yicheng Fu</a>, <a href="/search/cs?searchtype=author&query=Anantha%2C+R">Raviteja Anantha</a>, <a href="/search/cs?searchtype=author&query=Vashisht%2C+P">Prabal Vashisht</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jianpeng Cheng</a>, <a href="/search/cs?searchtype=author&query=Littwin%2C+E">Etai Littwin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.04081v3-abstract-full"> Generating user intent from a sequence of user interface (UI) actions is a core challenge in comprehensive UI understanding.
Recent advancements in multimodal large language models (MLLMs) have led to substantial progress in this area, but their demands for extensive model parameters, computing power, and high latency make them impractical for scenarios requiring lightweight, on-device solutions with low latency or heightened privacy. Additionally, the lack of high-quality datasets has hindered the development of such lightweight models. To address these challenges, we propose UI-JEPA, a novel framework that employs masking strategies to learn abstract UI embeddings from unlabeled data through self-supervised learning, combined with an LLM decoder fine-tuned for user intent prediction. We also introduce two new UI-grounded multimodal datasets, "Intent in the Wild" (IIW) and "Intent in the Tame" (IIT), designed for few-shot and zero-shot UI understanding tasks. IIW consists of 1.7K videos across 219 intent categories, while IIT contains 914 videos across 10 categories. We establish the first baselines for these datasets, showing that representations learned with a JEPA-style objective, combined with an LLM decoder, can achieve user intent predictions matching the performance of state-of-the-art large MLLMs, but with significantly reduced annotation and deployment resources. Measured by intent-similarity scores, UI-JEPA outperforms GPT-4 Turbo and Claude 3.5 Sonnet by 10.0% and 7.2% respectively, averaged across the two datasets. Notably, UI-JEPA achieves this performance with a 50.5x reduction in computational cost and a 6.6x improvement in latency on the IIW dataset. These results underscore the effectiveness of UI-JEPA, highlighting its potential for lightweight, high-performance UI understanding. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 6 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
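<p>A JEPA-style objective of the sort named above predicts masked content in representation space rather than pixel space. A minimal sketch, with the encoders and predictor left as stubs and all shapes assumed for illustration:</p>
<pre><code class="language-python">
import torch
import torch.nn.functional as F

def jepa_loss(context_encoder, target_encoder, predictor, frames, mask):
    # frames: (B, T, D) frame features; mask: (B, T) bool, True = hidden
    ctx = context_encoder(frames * (~mask).unsqueeze(-1))  # sees only visible frames
    pred = predictor(ctx)                                  # (B, T, D) predictions
    with torch.no_grad():                                  # stop-gradient target branch
        tgt = target_encoder(frames)
    return F.mse_loss(pred[mask], tgt[mask])               # compare masked positions only
</code></pre>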
</li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03193">arXiv:2409.03193</a> <span> [<a href="https://arxiv.org/pdf/2409.03193">pdf</a>, <a href="https://arxiv.org/format/2409.03193">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Upper-Limb Rehabilitation with a Dual-Mode Individualized Exoskeleton Robot: A Generative-Model-Based Solution </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yu Chen</a>, <a href="/search/cs?searchtype=author&query=Miao%2C+S">Shu Miao</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jing Ye</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+G">Gong Chen</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jianghua Cheng</a>, <a href="/search/cs?searchtype=author&query=Du%2C+K">Ketao Du</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.03193v1-abstract-full"> Several upper-limb exoskeleton robots have been developed for stroke rehabilitation, but their rather low level of individualized assistance typically limits their effectiveness and practicability. Individualized assistance requires an upper-limb exoskeleton robot to continuously assess feedback from a stroke patient and then meticulously adjust interaction forces to suit the patient's specific condition and online changes. This paper describes the development of a new upper-limb exoskeleton robot with a novel online generative capability that allows it to provide individualized assistance supporting the rehabilitation training of stroke patients. Specifically, the robot exploits generative models to customize a fine and fitting trajectory for the patient, as medical conditions, responses, and comfort feedback during training generally differ between patients. This generative capability is integrated into the robot's two working modes: an active mirroring mode for patients who retain motor abilities on one side of the body and a passive following mode for patients who lack motor ability on both sides of the body. The performance of the upper-limb exoskeleton robot was illustrated in experiments involving healthy subjects and stroke patients. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p>
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.03193v1-abstract-full').style.display = 'none'; document.getElementById('2409.03193v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.01068">arXiv:2409.01068</a> <span> [<a href="https://arxiv.org/pdf/2409.01068">pdf</a>, <a href="https://arxiv.org/format/2409.01068">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Progressive Retinal Image Registration via Global and Local Deformable Transformations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yepeng Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+B">Baosheng Yu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+T">Tian Chen</a>, <a href="/search/cs?searchtype=author&query=Gu%2C+Y">Yuliang Gu</a>, <a href="/search/cs?searchtype=author&query=Du%2C+B">Bo Du</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+Y">Yongchao Xu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jun Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2409.01068v2-abstract-short" style="display: inline;"> Retinal image registration plays an important role in the ophthalmological diagnosis process. Since there exist variances in viewing angles and anatomical structures across different retinal images, keypoint-based approaches become the mainstream methods for retinal image registration thanks to their robustness and low latency. These methods typically assume the retinal surfaces are planar, and ad… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2409.01068v2-abstract-full').style.display = 'inline'; document.getElementById('2409.01068v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2409.01068v2-abstract-full" style="display: none;"> Retinal image registration plays an important role in the ophthalmological diagnosis process. Since there exist variances in viewing angles and anatomical structures across different retinal images, keypoint-based approaches become the mainstream methods for retinal image registration thanks to their robustness and low latency. These methods typically assume the retinal surfaces are planar, and adopt feature matching to obtain the homography matrix that represents the global transformation between images. Yet, such a planar hypothesis inevitably introduces registration errors since retinal surface is approximately curved. This limitation is more prominent when registering image pairs with significant differences in viewing angles. To address this problem, we propose a hybrid registration framework called HybridRetina, which progressively registers retinal images with global and local deformable transformations. 
arXiv:2409.01068 [cs.CV] — https://arxiv.org/abs/2409.01068
Progressive Retinal Image Registration via Global and Local Deformable Transformations
Authors: Yepeng Liu, Baosheng Yu, Tian Chen, Yuliang Gu, Bo Du, Yongchao Xu, Jun Cheng
Abstract: Retinal image registration plays an important role in the ophthalmological diagnosis process. Since viewing angles and anatomical structures vary across retinal images, keypoint-based approaches have become the mainstream methods for retinal image registration thanks to their robustness and low latency. These methods typically assume that retinal surfaces are planar and adopt feature matching to obtain a homography matrix representing the global transformation between images. Yet such a planar hypothesis inevitably introduces registration errors, since the retinal surface is curved. This limitation is more prominent when registering image pairs with significant differences in viewing angle. To address this problem, we propose a hybrid registration framework called HybridRetina, which progressively registers retinal images with global and local deformable transformations. To that end, we use a keypoint detector and a deformation network called GAMorph to estimate the global transformation and the local deformable transformation, respectively. Specifically, we integrate multi-level pixel relation knowledge to guide the training of GAMorph. Additionally, we utilize an edge attention module that incorporates the geometric priors of the images, ensuring that the deformation field focuses on the vascular regions of clinical interest. Experiments on two widely used datasets, FIRE and FLoRI21, show that our proposed HybridRetina significantly outperforms several state-of-the-art methods. The code is available at https://github.com/lyp-deeplearning/awesome-retinal-registration.
Submitted 16 October, 2024; v1 submitted 2 September, 2024; originally announced September 2024.
Comments: Accepted at BIBM 2024
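The progressive global-then-local scheme can be sketched as two chained warps. Assuming OpenCV is available, the snippet below estimates a homography from synthetic keypoint matches and then applies a local displacement field; in the paper that field would come from the learned GAMorph network, whereas here it is a zero-filled stub.

    # Sketch of global-then-local registration; keypoints are synthetic and the
    # deformation field is a stand-in for a learned network's output.
    import numpy as np
    import cv2

    src = np.float32([[10, 10], [200, 30], [40, 220], [180, 200]]).reshape(-1, 1, 2)
    dst = src + 5.0                              # synthetic matched keypoints
    H, _ = cv2.findHomography(src, dst)          # step 1: global planar transform

    img = np.zeros((256, 256), np.uint8)
    warped = cv2.warpPerspective(img, H, (256, 256))

    # step 2: local deformable refinement; a real system would predict `field`
    # with the deformation network instead of using zeros.
    ys, xs = np.mgrid[0:256, 0:256].astype(np.float32)
    field = np.zeros((2, 256, 256), np.float32)  # (dx, dy) displacement stub
    registered = cv2.remap(warped, xs + field[0], ys + field[1], cv2.INTER_LINEAR)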
arXiv:2409.00638 [cs.CV] — https://arxiv.org/abs/2409.00638
IGEV++: Iterative Multi-range Geometry Encoding Volumes for Stereo Matching
Authors: Gangwei Xu, Xianqi Wang, Zhaoxing Zhang, Junda Cheng, Chunyuan Liao, Xin Yang
Abstract: Stereo matching is a core component in many computer vision and robotics systems. Despite significant advances over the last decade, handling matching ambiguities in ill-posed regions and large disparities remains an open challenge. In this paper, we propose a new deep network architecture, called IGEV++, for stereo matching. The proposed IGEV++ builds Multi-range Geometry Encoding Volumes (MGEV) that encode coarse-grained geometry information for ill-posed regions and large disparities, and fine-grained geometry information for details and small disparities. To construct MGEV, we introduce an adaptive patch matching module that efficiently and effectively computes matching costs for large disparity ranges and/or ill-posed regions. We further propose a selective geometry feature fusion module to adaptively fuse multi-range and multi-granularity geometry features in MGEV. We then index the fused geometry features and input them to ConvGRUs to iteratively update the disparity map. MGEV allows the network to efficiently handle large disparities and ill-posed regions, such as occlusions and textureless regions, and enjoys rapid convergence during iterations. Our IGEV++ achieves the best performance on the Scene Flow test set across all disparity ranges, up to 768px. Our IGEV++ also achieves state-of-the-art accuracy on the Middlebury, ETH3D, and KITTI 2012 and 2015 benchmarks. Specifically, IGEV++ achieves a 3.23% 2-pixel outlier rate (Bad 2.0) on the large-disparity benchmark Middlebury, representing error reductions of 31.9% and 54.8% compared to RAFT-Stereo and GMStereo, respectively. We also present a real-time version of IGEV++ that achieves the best performance among all published real-time methods on the KITTI benchmarks. The code is publicly available at https://github.com/gangweiX/IGEV-plusplus.
Submitted 1 September, 2024; originally announced September 2024.
Comments: 12 pages, 10 figures
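The multi-range idea (a fine-grained cost volume for small disparities plus a coarse-grained one for large disparities, fused and refined iteratively) can be caricatured as follows; the correlation, fusion, and GRU update below are toy stand-ins for MGEV and ConvGRUs, not the released code.

    # Toy sketch of multi-range cost volumes with iterative disparity updates.
    import torch
    import torch.nn as nn

    B, C, H, W = 1, 8, 32, 32
    left, right = torch.randn(B, C, H, W), torch.randn(B, C, H, W)

    def cost_volume(dmax: int, stride: int) -> torch.Tensor:
        vols = []
        for d in range(0, dmax, stride):         # correlation at each disparity
            shifted = torch.roll(right, shifts=d, dims=3)   # roll wraps; real
            vols.append((left * shifted).mean(1, keepdim=True))  # volumes pad
        return torch.cat(vols, dim=1)

    fine = cost_volume(16, 1)        # small disparities, fine granularity
    coarse = cost_volume(128, 8)     # large disparities, coarse granularity
    fused = torch.cat([fine, coarse], dim=1)     # stand-in for selective fusion

    gru = nn.GRUCell(fused.shape[1], 16)         # stand-in for a ConvGRU
    head = nn.Linear(16, 1)
    feat = fused.permute(0, 2, 3, 1).reshape(-1, fused.shape[1])
    hid = torch.zeros(B * H * W, 16)
    disp = torch.zeros(B * H * W)
    for _ in range(4):                           # iterative refinement
        hid = gru(feat, hid)
        disp = disp + head(hid).squeeze(-1)      # residual disparity update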
arXiv:2408.17052 [cs.CV] — https://arxiv.org/abs/2408.17052
Can We Leave Deepfake Data Behind in Training Deepfake Detector?
Authors: Jikang Cheng, Zhiyuan Yan, Ying Zhang, Yuhao Luo, Zhongyuan Wang, Chen Li
Abstract: The generalization ability of deepfake detectors is vital for their applications in real-world scenarios. One effective solution to enhance this ability is to train the models with manually blended data, which we term "blendfake," encouraging models to learn generic forgery artifacts like blending boundaries. Interestingly, current SoTA methods utilize blendfake without incorporating any deepfake data in their training process. This is likely because previous empirical observations suggest that vanilla hybrid training (VHT), which combines deepfake and blendfake data, results in inferior performance to methods using only blendfake data (so-called "1+1<2"). Therefore, a critical question arises: can we leave deepfake behind and rely solely on blendfake data to train an effective deepfake detector? Intuitively, since deepfakes also contain additional informative forgery clues (e.g., deep generative artifacts), excluding all deepfake data when training deepfake detectors seems counter-intuitive. In this paper, we rethink the role of blendfake in detecting deepfakes and formulate the process from "real to blendfake to deepfake" as a progressive transition. Specifically, blendfake and deepfake can be explicitly delineated as oriented pivot anchors along the "real-to-fake" transition. The accumulation of forgery information should be oriented and progressively increasing during this transition process. To this end, we propose an Oriented Progressive Regularizor (OPR) to establish constraints that compel the distribution of anchors to be discretely arranged. Furthermore, we introduce feature bridging to facilitate the smooth transition between adjacent anchors. Extensive experiments confirm that our design allows leveraging forgery information from both blendfake and deepfake effectively and comprehensively.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.17052v1-abstract-full').style.display = 'none'; document.getElementById('2408.17052v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.16268">arXiv:2408.16268</a> <span> [<a href="https://arxiv.org/pdf/2408.16268">pdf</a>, <a href="https://arxiv.org/format/2408.16268">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> UDD: Dataset Distillation via Mining Underutilized Regions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shiguang Wang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhongyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">Jian Cheng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.16268v1-abstract-short" style="display: inline;"> Dataset distillation synthesizes a small dataset such that a model trained on this set approximates the performance of the original dataset. Recent studies on dataset distillation focused primarily on the design of the optimization process, with methods such as gradient matching, feature alignment, and training trajectory matching. However, little attention has been given to the issue of underutil… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.16268v1-abstract-full').style.display = 'inline'; document.getElementById('2408.16268v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.16268v1-abstract-full" style="display: none;"> Dataset distillation synthesizes a small dataset such that a model trained on this set approximates the performance of the original dataset. Recent studies on dataset distillation focused primarily on the design of the optimization process, with methods such as gradient matching, feature alignment, and training trajectory matching. However, little attention has been given to the issue of underutilized regions in synthetic images. In this paper, we propose UDD, a novel approach to identify and exploit the underutilized regions to make them informative and discriminate, and thus improve the utilization of the synthetic dataset. Technically, UDD involves two underutilized regions searching policies for different conditions, i.e., response-based policy and data jittering-based policy. Compared with previous works, such two policies are utilization-sensitive, equipping with the ability to dynamically adjust the underutilized regions during the training process. Additionally, we analyze the current model optimization problem and design a category-wise feature contrastive loss, which can enhance the distinguishability of different categories and alleviate the shortcomings of the existing multi-formation methods. 
arXiv:2408.16268 [cs.CV] — https://arxiv.org/abs/2408.16268
UDD: Dataset Distillation via Mining Underutilized Regions
Authors: Shiguang Wang, Zhongyu Zhang, Jian Cheng
Abstract: Dataset distillation synthesizes a small dataset such that a model trained on this set approximates the performance of the original dataset. Recent studies on dataset distillation have focused primarily on the design of the optimization process, with methods such as gradient matching, feature alignment, and training trajectory matching. However, little attention has been given to the issue of underutilized regions in synthetic images. In this paper, we propose UDD, a novel approach that identifies and exploits underutilized regions to make them informative and discriminative, thus improving the utilization of the synthetic dataset. Technically, UDD involves two policies for searching underutilized regions under different conditions, i.e., a response-based policy and a data-jittering-based policy. Compared with previous works, these two policies are utilization-sensitive, equipping the method with the ability to dynamically adjust the underutilized regions during the training process. Additionally, we analyze the current model optimization problem and design a category-wise feature contrastive loss, which can enhance the distinguishability of different categories and alleviate the shortcomings of existing multi-formation methods. Experimentally, our method improves the utilization of the synthetic dataset and outperforms the state-of-the-art methods on various datasets, such as MNIST, FashionMNIST, SVHN, CIFAR-10, and CIFAR-100. For example, the improvements on CIFAR-10 and CIFAR-100 are 4.0% and 3.7% over the next-best method with IPC=1, achieved by mining the underutilized regions.
Submitted 29 August, 2024; originally announced August 2024.
Comments: PRCV2024
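A response-based search for underutilized regions might look like the following sketch: gradient magnitude stands in for the model response, and low-response pixels are jittered. The threshold, model, and jitter scale are all illustrative assumptions, not the paper's policies.

    # Sketch of a response-based policy on a synthetic image: find low-response
    # pixels via input gradients, then perturb only those regions.
    import torch
    import torch.nn as nn

    img = torch.randn(1, 3, 32, 32, requires_grad=True)
    model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1),
                          nn.AdaptiveAvgPool2d(1),
                          nn.Flatten(),
                          nn.Linear(8, 10))
    model(img).max().backward()
    response = img.grad.abs().sum(1, keepdim=True)        # pixel-wise response map
    underutilized = response < response.flatten().quantile(0.2)  # bottom 20%
    with torch.no_grad():                                 # jitter low-response pixels
        img[underutilized.expand_as(img)] += 0.1 * torch.randn(1)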
arXiv:2408.16238 [cs.IR] — https://arxiv.org/abs/2408.16238
Efficient Transfer Learning Framework for Cross-Domain Click-Through Rate Prediction
Authors: Qi Liu, Xingyuan Tang, Jianqiang Huang, Xiangqian Yu, Haoran Jin, Jin Chen, Yuanhao Pu, Defu Lian, Tan Qu, Zhe Wang, Jia Cheng, Jun Lei
Abstract: Natural content and advertisement coexist in industrial recommendation systems but differ in data distribution. Concretely, traffic related to advertisement is considerably sparser than that of natural content, which motivates transferring knowledge from the richer source natural-content domain to the sparser advertising domain. The challenges include the inefficiencies arising from managing extensive source data and the problem of "catastrophic forgetting" that results from the CTR model's daily updating. To this end, we propose a novel tri-level asynchronous framework, the Efficient Transfer Learning Framework for Cross-Domain Click-Through Rate Prediction (E-CDCTR), to transfer comprehensive knowledge of natural content to advertisement CTR models. The framework consists of three key components: the Tiny Pre-training Model (TPM), which trains a tiny CTR model with several basic features on long-term natural data; the Complete Pre-training Model (CPM), which trains a CTR model whose network structure and input features match the target advertisement model, on short-term natural data; and the Advertisement CTR model (A-CTR), which derives its parameter initialization from CPM together with multiple historical embeddings from TPM as extra features, and is then fine-tuned on advertisement data. TPM provides richer representations of users and items for both CPM and A-CTR, effectively alleviating the forgetting problem inherent in daily updates. CPM further enhances the advertisement model by providing a knowledgeable initialization, thereby alleviating the data-sparsity challenges typically encountered by advertising CTR models. This tri-level cross-domain transfer learning framework offers an efficient solution to both data sparsity and catastrophic forgetting, yielding remarkable improvements.
Submitted 28 August, 2024; originally announced August 2024.
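The tri-level flow (TPM embeddings as extra features, CPM as initialization, A-CTR fine-tuning) can be summarized in a short PyTorch sketch; all model shapes and names here are toy stand-ins, not the paper's implementation.

    # Schematic of the TPM -> CPM -> A-CTR transfer flow with toy models.
    import copy
    import torch
    import torch.nn as nn

    tpm_emb = nn.Embedding(1000, 8)      # TPM: trained on long-term natural data
    cpm = nn.Sequential(nn.Linear(24, 32), nn.ReLU(), nn.Linear(32, 1))
    # ... train cpm on short-term natural data (same layout as the ad model) ...

    actr = copy.deepcopy(cpm)            # A-CTR: parameters initialized from CPM
    user_ids = torch.randint(0, 1000, (4,))
    ad_feats = torch.randn(4, 16)
    x = torch.cat([ad_feats, tpm_emb(user_ids)], dim=1)  # TPM embeddings as
    ctr = torch.sigmoid(actr(x))         # extra features; fine-tune on ad data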
arXiv:2408.16233 [cs.CV] — https://arxiv.org/abs/2408.16233
PSE-Net: Channel Pruning for Convolutional Neural Networks with Parallel-subnets Estimator
Authors: Shiguang Wang, Tao Xie, Haijun Liu, Xingcheng Zhang, Jian Cheng
Abstract: Channel pruning is one of the most widespread techniques used to compress deep neural networks while maintaining their performance. Currently, a typical pruning algorithm leverages neural architecture search to directly find networks with a configurable width, the key step of which is to identify representative subnets for various pruning ratios by training a supernet. However, current methods mainly follow a serial training strategy to optimize the supernet, which is very time-consuming. In this work, we introduce PSE-Net, a novel parallel-subnets estimator for efficient channel pruning. Specifically, we propose a parallel-subnets training algorithm that simulates the forward-backward passes of multiple subnets by dropping extraneous features along the batch dimension, so that various subnets can be trained in one round. Our proposed algorithm facilitates the efficiency of supernet training and equips the network with the ability to interpolate the accuracy of unsampled subnets, enabling PSE-Net to effectively evaluate and rank the subnets. Over the trained supernet, we develop a prior-distribution-based sampling algorithm to boost the performance of classical evolutionary search. This algorithm utilizes prior information from the supernet training phase to assist in the search for optimal subnets while tackling the challenge of discovering samples that satisfy resource constraints, which arises from the long-tail distribution of network configurations.
Extensive experiments demonstrate that PSE-Net outperforms previous state-of-the-art channel pruning methods on the ImageNet dataset while retaining superior supernet training efficiency. For example, under a 300M FLOPs constraint, our pruned MobileNetV2 achieves 75.2% Top-1 accuracy on ImageNet, exceeding the original MobileNetV2 by 2.6 points while costing only 30%/16% of the training time of BCNet/AutoSlim.
Submitted 28 August, 2024; originally announced August 2024.
Comments: 10 pages; Neural Networks
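The parallel-subnets trick (simulating several width-subnets in one forward-backward pass by dropping features along the batch dimension) can be sketched as follows; the widths and the channel-masking scheme are an illustrative interpretation, not the authors' implementation.

    # One forward/backward pass trains several width-subnets: each batch slice
    # keeps a different number of channels (the rest are dropped).
    import torch
    import torch.nn as nn

    conv = nn.Conv2d(3, 32, 3, padding=1)        # toy supernet layer
    x = torch.randn(8, 3, 16, 16)
    y = conv(x)

    widths = [8, 16, 24, 32]                     # candidate channel counts
    chunks = y.chunk(len(widths), dim=0)         # one batch slice per subnet
    outs = []
    for w, chunk in zip(widths, chunks):
        mask = torch.zeros(1, 32, 1, 1)
        mask[:, :w] = 1.0                        # keep only the first w channels
        outs.append(chunk * mask)
    y = torch.cat(outs, dim=0)                   # all subnets share one backward
    loss = y.square().mean()
    loss.backward()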
arXiv:2408.15778 [cs.AI, cs.CL] — https://arxiv.org/abs/2408.15778
LogicGame: Benchmarking Rule-Based Reasoning Abilities of Large Language Models
Authors: Jiayi Gui, Yiming Liu, Jiale Cheng, Xiaotao Gu, Xiao Liu, Hongning Wang, Yuxiao Dong, Jie Tang, Minlie Huang
Abstract: Large Language Models (LLMs) have demonstrated notable capabilities across various tasks, showcasing complex problem-solving abilities. Understanding and executing complex rules, along with multi-step planning, are fundamental to logical reasoning and critical for practical LLM agents and decision-making systems. However, evaluating LLMs as effective rule-based executors and planners remains underexplored. In this paper, we introduce LogicGame, a novel benchmark designed to evaluate the comprehensive rule understanding, execution, and planning capabilities of LLMs. Unlike traditional benchmarks, LogicGame provides diverse games that contain a series of rules with an initial state, requiring models to comprehend and apply predefined regulations to solve problems. We create simulated scenarios in which models execute or plan operations to achieve specific outcomes. These game scenarios are specifically designed to distinguish logical reasoning from mere knowledge by relying exclusively on predefined rules. This separation allows for a pure assessment of rule-based reasoning capabilities. The evaluation considers not only final outcomes but also intermediate steps, providing a comprehensive assessment of model performance. Moreover, these intermediate steps are deterministic and can be automatically verified. LogicGame defines game scenarios with varying difficulty levels, from simple rule applications to complex reasoning chains, in order to offer a precise evaluation of model performance on rule understanding and multi-step execution. Utilizing LogicGame, we test various LLMs and identify notable shortcomings in their rule-based logical reasoning abilities.
Submitted 12 October, 2024; v1 submitted 28 August, 2024; originally announced August 2024.
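The claim that intermediate steps are deterministic and automatically verifiable can be illustrated with a toy rule set. The rule, state encoding, and helper names below are invented for illustration and are not taken from the benchmark.

    # Toy LogicGame-style checker: every intermediate state a model claims is
    # recomputed from the rules, so a plan can be verified step by step.
    def apply_rule(state: str, move: str) -> str:
        """Rule: 'swap i j' exchanges two characters of the state string."""
        _, i, j = move.split()
        s = list(state)
        s[int(i)], s[int(j)] = s[int(j)], s[int(i)]
        return "".join(s)

    def verify(initial: str, plan: list[str], claimed: list[str]) -> bool:
        state = initial
        for move, expected in zip(plan, claimed):
            state = apply_rule(state, move)
            if state != expected:                # deterministic step check
                return False
        return True

    print(verify("abcd", ["swap 0 1", "swap 2 3"], ["bacd", "badc"]))  # True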
arXiv:2408.13790 [cs.CV] — https://arxiv.org/abs/2408.13790
CV-MOS: A Cross-View Model for Motion Segmentation
Authors: Xiaoyu Tang, Zeyu Chen, Jintao Cheng, Xieyuanli Chen, Jin Wu, Bohuan Xue
Abstract: In autonomous driving, accurately distinguishing between static and moving objects is crucial. When performing the moving object segmentation (MOS) task, effectively leveraging motion information from objects becomes a primary challenge in improving the recognition of moving objects. Previous methods utilized either range-view (RV) or bird's-eye-view (BEV) residual maps to capture motion information. Unlike traditional approaches, we propose combining RV and BEV residual maps to jointly exploit a greater potential of motion information. Thus, we introduce CV-MOS, a cross-view model for moving object segmentation. Notably, we decouple spatial-temporal information by capturing motion from BEV and RV residual maps and generating semantic features from range images, which serve as moving-object guidance for the motion branch. Our direct and unique solution maximizes the use of range images and of RV and BEV residual maps, significantly enhancing the performance of the LiDAR-based MOS task. Our method achieved leading IoU scores of 77.5% and 79.2% on the validation and test sets of the SemanticKITTI dataset. In particular, CV-MOS demonstrates SOTA performance to date on various datasets. The CV-MOS implementation is available at https://github.com/SCNU-RISLAB/CV-MOS.
Submitted 25 August, 2024; originally announced August 2024.
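A rough sketch of the cross-view idea: motion cues come from both residual maps, gated by semantic features from the range image. Tensor shapes and the gating scheme are illustrative assumptions, not the released CV-MOS code.

    # Cross-view fusion sketch: RV and BEV residual features are fused and
    # modulated by semantic guidance from the range image (all modules toy).
    import torch
    import torch.nn as nn

    rv_residual = torch.randn(1, 8, 64, 512)     # range-view residual features
    bev_residual = torch.randn(1, 8, 64, 512)    # BEV residuals (projected to RV grid)
    range_img = torch.randn(1, 5, 64, 512)       # x, y, z, range, intensity

    semantic = nn.Conv2d(5, 8, 3, padding=1)(range_img)
    motion = torch.cat([rv_residual, bev_residual], dim=1)
    gate = torch.sigmoid(semantic)               # semantic guidance for motion branch
    fused = nn.Conv2d(16, 8, 1)(motion) * gate
    mos_logits = nn.Conv2d(8, 2, 1)(fused)       # moving vs. static per pixel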
arXiv:2408.13479 [quant-ph, cs.LG, q-bio.BM] — https://arxiv.org/abs/2408.13479
Quantum-machine-assisted Drug Discovery: Survey and Perspective
Authors: Yidong Zhou, Jintai Chen, Jinglei Cheng, Gopal Karemore, Marinka Zitnik, Frederic T. Chong, Junyu Liu, Tianfan Fu, Zhiding Liang
Abstract: Drug discovery and development is a highly complex and costly endeavor, typically requiring over a decade and substantial financial investment to bring a new drug to market. Traditional computer-aided drug design (CADD) has made significant progress in accelerating this process, but the development of quantum computing offers potential due to its unique capabilities. This paper discusses the integration of quantum computing into drug discovery and development, focusing on how quantum technologies might accelerate and enhance various stages of the drug development cycle. Specifically, we explore the application of quantum computing to challenges in drug discovery, such as molecular simulation and the prediction of drug-target interactions, as well as the optimization of clinical trial outcomes. By leveraging the inherent capabilities of quantum computing, we might be able to reduce the time and cost associated with bringing new drugs to market, ultimately benefiting public health.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.13479v3-abstract-full').style.display = 'none'; document.getElementById('2408.13479v3-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">27 pages, 10 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.12984">arXiv:2408.12984</a> <span> [<a href="https://arxiv.org/pdf/2408.12984">pdf</a>, <a href="https://arxiv.org/format/2408.12984">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Materials Science">cond-mat.mtrl-sci</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> PDDFormer: Pairwise Distance Distribution Graph Transformer for Crystal Material Property Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shen%2C+X">Xiangxiang Shen</a>, <a href="/search/cs?searchtype=author&query=Wan%2C+Z">Zheng Wan</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+L">Lingfeng Wen</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+L">Licheng Sun</a>, <a href="/search/cs?searchtype=author&query=Jie%2C+O+Y+M">Ou Yang Ming Jie</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+J">JiJUn Cheng</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xuan Tang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+X">Xian Wei</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.12984v4-abstract-short" style="display: inline;"> The crystal structure can be simplified as a periodic point set repeating across the entire three-dimensional space along an underlying lattice. Traditionally, methods for representing crystals rely on descriptors like lattice parameters, symmetry, and space groups to characterize the structure. However, in reality, atoms in material always vibrate above absolute zero, causing continuous fluctuati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.12984v4-abstract-full').style.display = 'inline'; document.getElementById('2408.12984v4-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.12984v4-abstract-full" style="display: none;"> The crystal structure can be simplified as a periodic point set repeating across the entire three-dimensional space along an underlying lattice. Traditionally, methods for representing crystals rely on descriptors like lattice parameters, symmetry, and space groups to characterize the structure. However, in reality, atoms in material always vibrate above absolute zero, causing continuous fluctuations in their positions. 
arXiv:2408.12984 [cond-mat.mtrl-sci, cs.AI] — https://arxiv.org/abs/2408.12984
PDDFormer: Pairwise Distance Distribution Graph Transformer for Crystal Material Property Prediction
Authors: Xiangxiang Shen, Zheng Wan, Lingfeng Wen, Licheng Sun, Ou Yang Ming Jie, JiJUn Cheng, Xuan Tang, Xian Wei
Abstract: A crystal structure can be simplified as a periodic point set repeating across the entire three-dimensional space along an underlying lattice. Traditionally, methods for representing crystals rely on descriptors like lattice parameters, symmetry, and space groups to characterize the structure. In reality, however, atoms in a material always vibrate above absolute zero, causing continuous fluctuations in their positions. This dynamic behavior disrupts the underlying periodicity of the lattice, making crystal graphs based on static lattice parameters and conventional descriptors discontinuous under even slight perturbations. To this end, chemists proposed the Pairwise Distance Distribution (PDD) method, which has been used to distinguish all periodic structures in the world's largest collection of real materials, the Cambridge Structural Database. However, achieving the completeness of PDD requires defining a large number of neighboring atoms, resulting in high computational costs. Moreover, it does not account for atomic information, making it challenging to apply PDD directly to crystal material property prediction tasks. To address these challenges, we propose the atom-Weighted Pairwise Distance Distribution (WPDD) and the Unit-cell Pairwise Distance Distribution (UPDD) for the first time, incorporating them into the construction of multi-edge crystal graphs. Based on this, we further developed WPDDFormer and UPDDFormer, graph transformer architectures constructed using WPDD and UPDD crystal graphs. We demonstrate that this method maintains the continuity and completeness of crystal graphs even under slight perturbations in atomic positions.
Submitted 24 November, 2024; v1 submitted 23 August, 2024; originally announced August 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 3 figures</span> </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Cheng%2C+J&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Cheng%2C+J&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Cheng%2C+J&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Cheng%2C+J&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Cheng%2C+J&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Cheng%2C+J&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a 
href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>