Search | arXiv e-print repository

<a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div> <!-- contains arXiv identity and search bar --> <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div> <!-- closes identity --> <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1&ndash;50 of 183 results for author: <span class="mathjax">Han, R</span> </h1> </div> <div class="level-right is-hidden-mobile"> <!-- feedback for mobile is moved to footer --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>&nbsp;&nbsp;</span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&amp;query=Han%2C+R">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." 
type="text" value="Han, R"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Han%2C+R&amp;terms-0-field=author&amp;size=50&amp;order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Han, R"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. 
</div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&amp;query=Han%2C+R&amp;start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&amp;query=Han%2C+R&amp;start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Han%2C+R&amp;start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Han%2C+R&amp;start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&amp;query=Han%2C+R&amp;start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.07980">arXiv:2502.07980</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.07980">pdf</a>, <a href="https://arxiv.org/format/2502.07980">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> CIRCUIT: A Benchmark for Circuit Interpretation and Reasoning Capabilities of LLMs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Skelic%2C+L">Lejla Skelic</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yan Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Cox%2C+M">Matthew Cox</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+W">Wenjie Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+T">Tao Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruonan Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.07980v1-abstract-short" style="display: inline;"> The role of Large Language Models (LLMs) has not been extensively explored in analog circuit design, which could benefit from a reasoning-based approach that transcends traditional optimization techniques. In particular, despite their growing relevance, there are no benchmarks to assess LLMs&#39; reasoning capability about circuits. 
Therefore, we created the CIRCUIT dataset consisting of 510 question-&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07980v1-abstract-full').style.display = 'inline'; document.getElementById('2502.07980v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.07980v1-abstract-full" style="display: none;"> The role of Large Language Models (LLMs) has not been extensively explored in analog circuit design, which could benefit from a reasoning-based approach that transcends traditional optimization techniques. In particular, despite their growing relevance, there are no benchmarks to assess LLMs&#39; reasoning capability about circuits. Therefore, we created the CIRCUIT dataset consisting of 510 question-answer pairs spanning various levels of analog-circuit-related subjects. The best-performing model on our dataset, GPT-4o, achieves 48.04% accuracy when evaluated on the final numerical answer. To evaluate the robustness of LLMs on our dataset, we introduced a unique feature that enables unit-test-like evaluation by grouping questions into unit tests. In this case, GPT-4o can only pass 27.45% of the unit tests, highlighting that the most advanced LLMs still struggle with understanding circuits, which requires multi-level reasoning, particularly when involving circuit topologies. This circuit-specific benchmark highlights LLMs&#39; limitations, offering valuable insights for advancing their application in analog integrated circuit design. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.07980v1-abstract-full').style.display = 'none'; document.getElementById('2502.07980v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. 
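
The unit-test-style evaluation this abstract describes can be illustrated with a short sketch: group per-question correctness by unit test and count a test as passed only if every question in it is answered correctly. This is a minimal reading of the abstract, not the authors' released evaluation code; the field names are hypothetical.

```python
from collections import defaultdict

def unit_test_pass_rate(results):
    """results: iterable of (unit_test_id, is_correct) pairs, one per
    question. A unit test passes only if all of its questions are correct."""
    by_test = defaultdict(list)
    for test_id, correct in results:
        by_test[test_id].append(correct)
    return sum(all(v) for v in by_test.values()) / len(by_test)

# Per-question accuracy can sit well above the grouped pass rate, which is
# how 48.04% answer accuracy can coexist with only 27.45% of tests passed.
print(unit_test_pass_rate([("t1", True), ("t1", False), ("t2", True)]))  # 0.5
```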
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2502.00631">arXiv:2502.00631</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2502.00631">pdf</a>, <a href="https://arxiv.org/format/2502.00631">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> MedConv: Convolutions Beat Transformers on Long-Tailed Bone Density Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qi%2C+X">Xuyin Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zeyu Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zheng%2C+H">Huazhan Zheng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+M">Mingxi Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Kutaiba%2C+N">Numan Kutaiba</a>, <a href="/search/cs?searchtype=author&amp;query=Lim%2C+R">Ruth Lim</a>, <a href="/search/cs?searchtype=author&amp;query=Chiang%2C+C">Cherie Chiang</a>, <a href="/search/cs?searchtype=author&amp;query=Tham%2C+Z+E">Zi En Tham</a>, <a href="/search/cs?searchtype=author&amp;query=Ren%2C+X">Xuan Ren</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+W">Wenxin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Lei Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+H">Hao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Lv%2C+W">Wenbing Lv</a>, <a href="/search/cs?searchtype=author&amp;query=Yao%2C+G">Guangzhen Yao</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Renda Han</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+K">Kangsheng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+M">Mingyuan Li</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+H">Hongtao Mao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Y">Yu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Liao%2C+Z">Zhibin Liao</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yang Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=To%2C+M">Minh-Son To</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2502.00631v1-abstract-short" style="display: inline;"> Bone density prediction via CT scans to estimate T-scores is crucial, providing a more precise assessment of bone health compared to traditional methods like X-ray bone density tests, which lack spatial resolution and the ability to detect localized changes. 
However, CT-based prediction faces two major challenges: the high computational complexity of transformer-based architectures, which limits t&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00631v1-abstract-full').style.display = 'inline'; document.getElementById('2502.00631v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2502.00631v1-abstract-full" style="display: none;"> Bone density prediction via CT scans to estimate T-scores is crucial, providing a more precise assessment of bone health compared to traditional methods like X-ray bone density tests, which lack spatial resolution and the ability to detect localized changes. However, CT-based prediction faces two major challenges: the high computational complexity of transformer-based architectures, which limits their deployment in portable and clinical settings, and the imbalanced, long-tailed distribution of real-world hospital data that skews predictions. To address these issues, we introduce MedConv, a convolutional model for bone density prediction that outperforms transformer models with lower computational demands. We also adapt Bal-CE loss and post-hoc logit adjustment to improve class balance. Extensive experiments on our AustinSpine dataset shows that our approach achieves up to 21% improvement in accuracy and 20% in ROC AUC over previous state-of-the-art methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2502.00631v1-abstract-full').style.display = 'none'; document.getElementById('2502.00631v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> February 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.07585">arXiv:2501.07585</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.07585">pdf</a>, <a href="https://arxiv.org/format/2501.07585">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Multi-task Domain Adaptation for Computation Offloading in Edge-intelligence Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Runxin Han</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+B">Bo Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Yu%2C+Z">Zhiwen Yu</a>, <a href="/search/cs?searchtype=author&amp;query=Cao%2C+X">Xuelin Cao</a>, <a href="/search/cs?searchtype=author&amp;query=Alexandropoulos%2C+G+C">George C. 
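
Post-hoc logit adjustment, which this abstract adapts for class balance, is commonly implemented by subtracting scaled log class priors from the logits. The sketch below shows that standard recipe; the paper's exact variant may differ.

```python
import numpy as np

def adjust_logits(logits, class_counts, tau=1.0):
    """Subtract tau * log(prior) so that rare (tail) classes are
    no longer systematically under-predicted at test time."""
    priors = np.asarray(class_counts, dtype=float)
    priors /= priors.sum()
    return np.asarray(logits, dtype=float) - tau * np.log(priors)

# A tail class (10 vs. 1000 training samples) wins after adjustment.
print(adjust_logits([2.0, 1.9], [1000, 10]).argmax())  # 1
```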

3. arXiv:2501.07585 [pdf, other] (cs.LG, cs.AI)
Multi-task Domain Adaptation for Computation Offloading in Edge-intelligence Networks
Authors: Runxin Han, Bo Yang, Zhiwen Yu, Xuelin Cao, George C. Alexandropoulos, Chau Yuen
Abstract: In the field of multi-access edge computing (MEC), efficient computation offloading is crucial for improving resource utilization and reducing latency in dynamically changing environments. This paper introduces a new approach, termed Multi-Task Domain Adaptation (MTDA), aiming to enhance the ability of computation offloading models to generalize in the presence of domain shifts, i.e., when new data in the target environment significantly differ from the data in the source domain. The proposed MTDA model incorporates a teacher-student architecture that allows continuous adaptation without requiring access to the source-domain data during inference, thereby maintaining privacy and reducing computational overhead. Utilizing a multi-task learning framework that simultaneously manages offloading decisions and resource allocation, the proposed MTDA approach outperforms benchmark methods in terms of mean squared error and accuracy, particularly in environments with increasing numbers of users. Computer simulations show that the proposed MTDA model maintains high performance across various scenarios, demonstrating its potential for practical deployment in emerging MEC applications.
Submitted 2 January, 2025; originally announced January 2025.
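
A teacher-student architecture that adapts without source data is often realized as a mean teacher: the teacher is an exponential moving average (EMA) of the student and supplies pseudo-targets on target-domain batches. The sketch below is a generic version of that idea under those assumptions, not the authors' implementation.

```python
import torch

@torch.no_grad()
def ema_update(teacher, student, momentum=0.99):
    """Teacher parameters track an exponential moving average of the
    student's, so adaptation needs no source-domain data at inference."""
    for p_t, p_s in zip(teacher.parameters(), student.parameters()):
        p_t.mul_(momentum).add_(p_s, alpha=1.0 - momentum)

# One adaptation step on a target-domain batch x (illustrative):
#   loss = torch.nn.functional.mse_loss(student(x), teacher(x).detach())
#   loss.backward(); optimizer.step(); ema_update(teacher, student)
```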
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.03695">arXiv:2501.03695</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.03695">pdf</a>, <a href="https://arxiv.org/format/2501.03695">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> </div> </div> <p class="title is-5 mathjax"> Unraveling Responsiveness of Chained BFT Consensus with Network Delay </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Y">Yining Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Q">Qihang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Runchao Han</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+J">Jianyu Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Chen Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yinqian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.03695v1-abstract-short" style="display: inline;"> With the advancement of blockchain technology, chained Byzantine Fault Tolerant (BFT) protocols have been increasingly adopted in practical systems, making their performance a crucial aspect of the study. In this paper, we introduce a unified framework utilizing Markov Decision Processes (MDP) to model and assess the performance of three prominent chained BFT protocols. Our framework effectively c&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03695v1-abstract-full').style.display = 'inline'; document.getElementById('2501.03695v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.03695v1-abstract-full" style="display: none;"> With the advancement of blockchain technology, chained Byzantine Fault Tolerant (BFT) protocols have been increasingly adopted in practical systems, making their performance a crucial aspect of the study. In this paper, we introduce a unified framework utilizing Markov Decision Processes (MDP) to model and assess the performance of three prominent chained BFT protocols. Our framework effectively captures complex adversarial behaviors, focusing on two key performance metrics: chain growth and commitment rate. We implement the optimal attack strategies obtained from MDP analysis on an existing evaluation platform for chained BFT protocols and conduct extensive experiments under various settings to validate our theoretical results. Through rigorous theoretical analysis and thorough practical experiments, we provide an in-depth evaluation of chained BFT protocols under diverse attack scenarios, uncovering optimal attack strategies. Contrary to conventional belief, our findings reveal that while responsiveness can enhance performance, it is not universally beneficial across all scenarios. This work not only deepens our understanding of chained BFT protocols, but also offers valuable insights and analytical tools that can inform the design of more robust and efficient protocols. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.03695v1-abstract-full').style.display = 'none'; document.getElementById('2501.03695v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> January 2025. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2501.02970">arXiv:2501.02970</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2501.02970">pdf</a>, <a href="https://arxiv.org/format/2501.02970">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Cryptography and Security">cs.CR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> </div> </div> <p class="title is-5 mathjax"> Leader Rotation Is Not Enough: Scrutinizing Leadership Democracy of Chained BFT Consensus </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Tang%2C+Y">Yining Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Runchao Han</a>, <a href="/search/cs?searchtype=author&amp;query=Niu%2C+J">Jianyu Niu</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Chen Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yinqian Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2501.02970v1-abstract-short" style="display: inline;"> With the growing popularity of blockchains, modern chained BFT protocols combining chaining and leader rotation to obtain better efficiency and leadership democracy have received increasing interest. Although the efficiency provisions of chained BFT protocols have been thoroughly analyzed, the leadership democracy has received little attention in prior work. In this paper, we scrutinize the leader&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2501.02970v1-abstract-full').style.display = 'inline'; document.getElementById('2501.02970v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2501.02970v1-abstract-full" style="display: none;"> With the growing popularity of blockchains, modern chained BFT protocols combining chaining and leader rotation to obtain better efficiency and leadership democracy have received increasing interest. Although the efficiency provisions of chained BFT protocols have been thoroughly analyzed, the leadership democracy has received little attention in prior work. In this paper, we scrutinize the leadership democracy of four representative chained BFT protocols, especially under attack. To this end, we propose a unified framework with two evaluation metrics, i.e., chain quality and censorship resilience, and quantitatively analyze chosen protocols through the Markov Decision Process (MDP). With this framework, we further examine the impact of two key components, i.e., voting pattern and leader rotation on leadership democracy. 

5. arXiv:2501.02970 [pdf, other] (cs.CR, cs.DC)
Leader Rotation Is Not Enough: Scrutinizing Leadership Democracy of Chained BFT Consensus
Authors: Yining Tang, Runchao Han, Jianyu Niu, Chen Feng, Yinqian Zhang
Abstract: With the growing popularity of blockchains, modern chained BFT protocols combining chaining and leader rotation to obtain better efficiency and leadership democracy have received increasing interest. Although the efficiency provisions of chained BFT protocols have been thoroughly analyzed, their leadership democracy has received little attention in prior work. In this paper, we scrutinize the leadership democracy of four representative chained BFT protocols, especially under attack. To this end, we propose a unified framework with two evaluation metrics, i.e., chain quality and censorship resilience, and quantitatively analyze the chosen protocols through the Markov Decision Process (MDP). With this framework, we further examine the impact of two key components, i.e., voting pattern and leader rotation, on leadership democracy. Our results indicate that leader rotation alone is not enough to guarantee leadership democracy; an adversary can exploit design elements, e.g., the voting pattern, to significantly degrade leadership democracy. Based on the analysis results, we propose customized countermeasures for three evaluated protocols to improve their leadership democracy with only slight protocol overhead and no change to consensus rules. We also discuss future directions toward building more democratic chained BFT protocols.
Submitted 6 January, 2025; originally announced January 2025.
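
Of the two metrics named in this abstract, chain quality has a particularly compact definition: the fraction of main-chain blocks proposed by honest leaders. A minimal sketch (censorship resilience needs a richer model):

```python
def chain_quality(chain, honest_leaders):
    """chain: sequence of proposer IDs for blocks on the main chain."""
    return sum(leader in honest_leaders for leader in chain) / len(chain)

# 3 of 4 blocks came from honest leaders -> chain quality 0.75
print(chain_quality(["alice", "bob", "eve", "alice"], {"alice", "bob"}))
```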

6. arXiv:2412.14222 [pdf, other] (cs.AI, cs.CL, cs.LG, stat.OT)
A Survey on Large Language Model-based Agents for Statistics and Data Science
Authors: Maojun Sun, Ruijian Han, Binyan Jiang, Houduo Qi, Defeng Sun, Yancheng Yuan, Jian Huang
Abstract: In recent years, data science agents powered by Large Language Models (LLMs), known as "data agents," have shown significant potential to transform the traditional data analysis paradigm. This survey provides an overview of the evolution, capabilities, and applications of LLM-based data agents, highlighting their role in simplifying complex data tasks and lowering the entry barrier for users without related expertise. We explore current trends in the design of LLM-based frameworks, detailing essential features such as planning, reasoning, reflection, multi-agent collaboration, user interface, knowledge integration, and system design, which enable agents to address data-centric problems with minimal human intervention. Furthermore, we analyze several case studies to demonstrate the practical applications of various data agents in real-world scenarios. Finally, we identify key challenges and propose future research directions to advance the development of data agents into intelligent statistical analysis software.
Submitted 18 December, 2024; originally announced December 2024.

7. arXiv:2412.14171 [pdf, other] (cs.CV)
Thinking in Space: How Multimodal Large Language Models See, Remember, and Recall Spaces
Authors: Jihan Yang, Shusheng Yang, Anjali W. Gupta, Rilyn Han, Li Fei-Fei, Saining Xie
Abstract: Humans possess the visual-spatial intelligence to remember spaces from sequential visual observations. However, can Multimodal Large Language Models (MLLMs) trained on million-scale video datasets also "think in space" from videos? We present a novel video-based visual-spatial intelligence benchmark (VSI-Bench) of over 5,000 question-answer pairs, and find that MLLMs exhibit competitive, though subhuman, visual-spatial intelligence. We probe models to express how they think in space both linguistically and visually, and find that while spatial reasoning capabilities remain the primary bottleneck for MLLMs to reach higher benchmark performance, local world models and spatial awareness do emerge within these models. Notably, prevailing linguistic reasoning techniques (e.g., chain-of-thought, self-consistency, tree-of-thoughts) fail to improve performance, whereas explicitly generating cognitive maps during question-answering enhances MLLMs' spatial distance ability.
Submitted 18 December, 2024; originally announced December 2024.
Comments: Project page: https://vision-x-nyu.github.io/thinking-in-space.github.io/

8. arXiv:2412.09727 [pdf] (q-bio.QM, cs.AI, cs.LG)
Let Curves Speak: A Continuous Glucose Monitor based Large Sensor Foundation Model for Diabetes Management
Authors: Junjie Luo, Abhimanyu Kumbara, Mansur Shomali, Rui Han, Anand Iyer, Ritu Agarwal, Gordon Gao
Abstract: While previous studies of AI in diabetes management focus on long-term risk, research on near-future glucose prediction remains limited but important, as it enables timely diabetes self-management. Integrating AI with continuous glucose monitoring (CGM) holds promise for near-future glucose prediction. However, existing models have limitations in capturing patterns of blood glucose fluctuations and demonstrate poor generalizability. A robust approach is needed to leverage massive CGM data for near-future glucose prediction. We propose large sensor models (LSMs) to capture knowledge in CGM data by modeling patients as sequences of glucose. CGM-LSM is pretrained on 15.96 million glucose records from 592 diabetes patients for near-future glucose prediction. We evaluated CGM-LSM against state-of-the-art methods using the OhioT1DM dataset across various metrics, prediction horizons, and unseen patients. Additionally, we assessed its generalizability across factors like diabetes type, age, gender, and hour of day. CGM-LSM achieved exceptional performance, with an rMSE of 29.81 mg/dL for type 1 diabetes patients and 23.49 mg/dL for type 2 diabetes patients in a two-hour prediction horizon. For the OhioT1DM dataset, CGM-LSM achieved a one-hour rMSE of 15.64 mg/dL, halving the previous best of 31.97 mg/dL. Robustness analyses revealed consistent performance not only for unseen patients and future periods, but also across diabetes type, age, and gender. The model demonstrated adaptability to different hours of day, maintaining accuracy across periods of various activity intensity levels. CGM-LSM represents a transformative step in diabetes management by leveraging pretraining to uncover latent glucose generation patterns in sensor data. Our findings also underscore the broader potential of LSMs to drive innovation across domains involving complex sensor data.
Submitted 17 December, 2024; v1 submitted 12 December, 2024; originally announced December 2024.
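
The rMSE figures quoted above (e.g., 15.64 mg/dL at a one-hour horizon) follow the standard root-mean-squared-error definition; a minimal version for glucose traces:

```python
import numpy as np

def rmse(y_true, y_pred):
    """Root-mean-squared error in mg/dL between observed and predicted
    glucose values at a fixed prediction horizon."""
    y_true, y_pred = np.asarray(y_true, float), np.asarray(y_pred, float)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

print(rmse([110, 125, 140], [115, 120, 150]))  # ~7.07
```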

9. arXiv:2412.00386 [pdf, other] (cs.AI)
Strategic Application of AIGC for UAV Trajectory Design: A Channel Knowledge Map Approach
Authors: Chiya Zhang, Ting Wang, Rubing Han, Yuanxiang Gong
Abstract: Unmanned Aerial Vehicles (UAVs) are increasingly utilized in wireless communication, yet accurate channel loss prediction remains a significant challenge, limiting resource optimization performance. To address this issue, this paper leverages Artificial Intelligence Generated Content (AIGC) for the efficient construction of Channel Knowledge Maps (CKM) and UAV trajectory design. Given the time-consuming nature of channel data collection, AI techniques are employed in a Wasserstein Generative Adversarial Network (WGAN) to extract environmental features and augment the data. Experimental results demonstrate the effectiveness of the proposed framework in improving CKM construction accuracy. Moreover, integrating CKM into UAV trajectory planning reduces channel gain uncertainty, demonstrating its potential to enhance wireless communication efficiency.
Submitted 30 November, 2024; originally announced December 2024.
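
The WGAN-based augmentation mentioned here rests on the Wasserstein critic objective. A minimal sketch of the two losses follows (generic WGAN, assuming a Lipschitz constraint such as weight clipping or a gradient penalty is applied separately; not the paper's network):

```python
import torch

def critic_loss(critic, generator, real, z):
    """Critic maximizes the score gap between real and generated samples,
    so we minimize the negated gap."""
    fake = generator(z).detach()  # don't update the generator here
    return critic(fake).mean() - critic(real).mean()

def generator_loss(critic, generator, z):
    """Generator tries to make the critic score its samples highly."""
    return -critic(generator(z)).mean()
```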
mathjax" id="2411.19865v1-abstract-short" style="display: inline;"> Reverse thinking plays a crucial role in human reasoning. Humans can reason not only from a problem to a solution but also in reverse, i.e., start from the solution and reason towards the problem. This often enhances overall reasoning performance as it enables consistency checks between their forward and backward thinking. To enable Large Language Models (LLMs) to perform reverse thinking, we intr&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19865v1-abstract-full').style.display = 'inline'; document.getElementById('2411.19865v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.19865v1-abstract-full" style="display: none;"> Reverse thinking plays a crucial role in human reasoning. Humans can reason not only from a problem to a solution but also in reverse, i.e., start from the solution and reason towards the problem. This often enhances overall reasoning performance as it enables consistency checks between their forward and backward thinking. To enable Large Language Models (LLMs) to perform reverse thinking, we introduce Reverse-Enhanced Thinking (RevThink), a framework composed of data augmentation and learning objectives. In RevThink, we augment the dataset by collecting structured forward-backward reasoning from a teacher model, consisting of: (1) the original question, (2) forward reasoning, (3) backward question, and (4) backward reasoning. We then employ three objectives to train a smaller student model in a multi-task learning fashion: (a) generate forward reasoning from a question, (b) generate a backward question from a question, and (c) generate backward reasoning from the backward question. Experiments across 12 datasets covering commonsense, math, and logical reasoning show an average 13.53% improvement over the student model&#39;s zero-shot performance and a 6.84% improvement over the strongest knowledge distillation baselines. Moreover, our method demonstrates sample efficiency -- using only 10% of the correct forward reasoning from the training data, it outperforms a standard fine-tuning method trained on 10x more forward reasoning. RevThink also exhibits strong generalization to out-of-distribution held-out datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.19865v1-abstract-full').style.display = 'none'; document.getElementById('2411.19865v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.17083">arXiv:2411.17083</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.17083">pdf</a>, <a href="https://arxiv.org/format/2411.17083">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Fluid Dynamics">physics.flu-dyn</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Geophysics">physics.geo-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Instrumentation and Detectors">physics.ins-det</span> </div> </div> <p class="title is-5 mathjax"> A Haptic-Based Proximity Sensing System for Buried Object in Granular Material </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zeqing Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Jia%2C+R">Ruixing Jia</a>, <a href="/search/cs?searchtype=author&amp;query=Yan%2C+Y">Youcan Yan</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruihua Han</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+S">Shijie Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+Q">Qian Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+L">Liangjun Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+J">Jia Pan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.17083v1-abstract-short" style="display: inline;"> The proximity perception of objects in granular materials is significant, especially for applications like minesweeping. However, due to particles&#39; opacity and complex properties, existing proximity sensors suffer from high costs from sophisticated hardware and high user-cost from unintuitive results. In this paper, we propose a simple yet effective proximity sensing system for underground stuff b&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.17083v1-abstract-full').style.display = 'inline'; document.getElementById('2411.17083v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.17083v1-abstract-full" style="display: none;"> The proximity perception of objects in granular materials is significant, especially for applications like minesweeping. However, due to particles&#39; opacity and complex properties, existing proximity sensors suffer from high costs from sophisticated hardware and high user-cost from unintuitive results. In this paper, we propose a simple yet effective proximity sensing system for underground stuff based on the haptic feedback of the sensor-granules interaction. We study and employ the unique characteristic of particles -- failure wedge zone, and combine the machine learning method -- Gaussian process regression, to identify the force signal changes induced by the proximity of objects, so as to achieve near-field perception. 

11. arXiv:2411.17083 [pdf, other] (cs.RO, physics.flu-dyn, physics.geo-ph, physics.ins-det)
A Haptic-Based Proximity Sensing System for Buried Object in Granular Material
Authors: Zeqing Zhang, Ruixing Jia, Youcan Yan, Ruihua Han, Shijie Lin, Qian Jiang, Liangjun Zhang, Jia Pan
Abstract: The proximity perception of objects in granular materials is significant, especially for applications like minesweeping. However, due to particles' opacity and complex properties, existing proximity sensors suffer from high costs due to sophisticated hardware and high user cost due to unintuitive results. In this paper, we propose a simple yet effective proximity sensing system for buried objects based on the haptic feedback of the sensor-granule interaction. We study and employ a unique characteristic of particles, the failure wedge zone, combined with a machine learning method, Gaussian process regression, to identify the force signal changes induced by the proximity of objects, so as to achieve near-field perception. Furthermore, we design a novel trajectory to control the probe searching in granules for a wide range of perception. Our proximity sensing system can also adaptively determine optimal parameters for robust operation in different particles. Experiments demonstrate our system can perceive buried objects 0.5 to 7 cm in advance across various materials.
Submitted 25 November, 2024; originally announced November 2024.
Comments: The 40th International Symposium of Robotics Research (ISRR). Long Beach, California, USA, December 8-12, 2024
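
The Gaussian-process step in this abstract can be pictured as fitting a baseline force-versus-depth model and flagging deviations as an approaching object. The toy sketch below uses synthetic data, not the authors' sensor pipeline.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

rng = np.random.default_rng(0)
depth = np.linspace(0.0, 10.0, 50)[:, None]                    # probe depth (cm)
force = 0.3 * depth.ravel() + 0.05 * rng.standard_normal(50)   # free-field force

gpr = GaussianProcessRegressor(kernel=RBF(1.0) + WhiteKernel(1e-2)).fit(depth, force)
baseline_mean, baseline_std = gpr.predict(depth, return_std=True)

# A new reading far above the predicted band suggests the failure wedge
# of a buried object is loading the probe.
new_depth, new_force = 7.0, 3.5
print(new_force > gpr.predict([[new_depth]])[0] + 3 * baseline_std.mean())  # True
```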
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11340">arXiv:2411.11340</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.11340">pdf</a>, <a href="https://arxiv.org/format/2411.11340">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> A Hybrid Loss Framework for Decomposition-based Time Series Forecasting Methods: Balancing Global and Component Errors </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ronghui Han</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+D">Duanyu Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Du%2C+H">Hongyu Du</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+H">Hao Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.11340v1-abstract-full"> Accurate time series forecasting, predicting future values based on past data, is crucial for diverse industries. Many current time series methods decompose time series into multiple sub-series, applying different model architectures and training with an end-to-end overall loss for forecasting. However, this raises a question: does this overall loss prioritize the importance of critical sub-series within the decomposition for better performance? To investigate this, we study the impact of the overall loss on existing time series methods with sequence decomposition. Our findings reveal that the overall loss may introduce bias into model learning, hindering the prioritization of more significant sub-series and limiting forecasting performance. To address this, we propose a hybrid loss framework combining global and component losses. This framework introduces component losses for each sub-series alongside the original overall loss. It employs a dual min-max algorithm to dynamically adjust the weights between the overall loss and the component losses, and within the component losses. This enables current time series methods to achieve better performance by focusing on more critical sub-series while still maintaining a low overall loss. We integrate our loss framework into several time series methods and evaluate the performance on multiple datasets. Results show an average improvement of 0.5-2% over existing methods without any modifications to the model architectures. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li>
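<p class="is-size-7">A minimal sketch of one plausible form of such a hybrid loss follows: per-component losses are added to the overall loss, and component weights are adapted multiplicatively toward the worst-fit sub-series (the "max" player of the min-max game). The exact dual min-max algorithm in the paper may differ; everything below is an assumed simplification.</p>
<pre><code class="language-python">
# Assumed simplified form, not the authors' exact algorithm: a hybrid loss that
# adds per-sub-series component losses to the overall loss, with component
# weights pushed multiplicatively toward the worst-fit sub-series.
import numpy as np

def hybrid_loss(overall, components, w, lam=0.5):
    # overall: scalar loss on the full series; components: per-sub-series losses
    return (1 - lam) * overall + lam * float(np.dot(w, components))

def update_weights(w, components, eta=0.1):
    # multiplicative-weights ascent: upweight sub-series with larger loss
    w = w * np.exp(eta * np.asarray(components))
    return w / w.sum()

w = np.ones(3) / 3                        # e.g. trend, seasonal, residual parts
components = np.array([0.8, 0.2, 0.1])    # hypothetical component losses
for _ in range(5):
    w = update_weights(w, components)
print(hybrid_loss(0.4, components, w), w)  # weight mass shifts to component 0
</code></pre>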
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02003">arXiv:2411.02003</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2411.02003">pdf</a>, <a href="https://arxiv.org/format/2411.02003">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Distributed, Parallel, and Cluster Computing">cs.DC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Against Multifaceted Graph Heterogeneity via Asymmetric Federated Prompt Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Guo%2C+Z">Zhuoning Guo</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruiqian Han</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+H">Hao Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2411.02003v1-abstract-full"> Federated Graph Learning (FGL) aims to collaboratively and privately optimize graph models on divergent data for different tasks. A critical challenge in FGL is to enable effective yet efficient federated optimization against multifaceted graph heterogeneity to enhance mutual performance. However, existing FGL works primarily address graph data heterogeneity and remain incapable of handling graph task heterogeneity. To address this challenge, we propose a Federated Graph Prompt Learning (FedGPL) framework to efficiently enable prompt-based asymmetric graph knowledge transfer between multifaceted heterogeneous federated participants. Generally, we establish a split federated framework to preserve universal and domain-specific graph knowledge, respectively. Moreover, we develop two algorithms to eliminate task and data heterogeneity for advanced federated knowledge preservation. First, a Hierarchical Directed Transfer Aggregator (HiDTA) delivers cross-task beneficial knowledge that is hierarchically distilled according to directional transferability. Second, a Virtual Prompt Graph (VPG) adaptively generates graph structures to enhance data utility by distinguishing dominant subgraphs and neutralizing redundant ones. We conduct theoretical analyses and extensive experiments to demonstrate the significant accuracy and efficiency gains of FedGPL against multifaceted graph heterogeneity compared to state-of-the-art baselines on large-scale federated graph datasets. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li>
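<p class="is-size-7">As a toy sketch of prompt-based knowledge transfer in a split federated setup: the server aggregates client prompt parameters, weighted by per-client transferability scores. The scores and the flat softmax weighting below are hypothetical stand-ins; HiDTA's aggregation is hierarchical and directional, which this sketch does not capture.</p>
<pre><code class="language-python">
# Toy sketch, not FedGPL itself: server-side aggregation of client prompt
# parameters, weighted by (hypothetical) cross-task transferability scores.
import numpy as np

def aggregate_prompts(client_prompts, transfer_scores):
    # client_prompts: (n_clients, prompt_dim); softmax the scores into weights
    s = np.exp(transfer_scores - np.max(transfer_scores))
    w = s / s.sum()
    return w @ client_prompts              # shared "universal" prompt

prompts = np.random.default_rng(1).normal(size=(4, 16))   # 4 clients
scores = np.array([0.9, 0.2, 0.5, 0.7])                   # assumed transferability
universal_prompt = aggregate_prompts(prompts, scores)
print(universal_prompt.shape)              # (16,)
</code></pre>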
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20627">arXiv:2410.20627</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20627">pdf</a>, <a href="https://arxiv.org/format/2410.20627">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> </div> </div> <p class="title is-5 mathjax"> DHPrep: Deep Hawkes Process based Dynamic Network Representation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruixuan Han</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+H">Hongxiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Xie%2C+B">Bin Xie</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.20627v1-abstract-full"> Network representation aims to encode vertices into a low-dimensional space while preserving the original network structures and properties. Most existing methods focus on static network structure without considering temporal dynamics. However, in the real world, most networks (e.g., social and biological networks) are dynamic in nature and are constantly evolving over time. Such temporal dynamics are critical in representation learning, especially for predicting the behavior of dynamic networks. To this end, a Deep Hawkes Process based Dynamic Network Representation algorithm (DHPrep) is proposed in this paper, which is capable of capturing the temporal dynamics of dynamic networks. Specifically, DHPrep incorporates both structural information and temporal dynamics to learn vertex representations that can model the edge formation process for a vertex pair, where the structural information is used to capture the historical impact from their neighborhood, and the temporal dynamics utilize this historical information and apply a Hawkes point process to model the edge formation process. Moreover, a temporal smoother is further imposed to ensure that the representations evolve smoothly over time. To evaluate the effectiveness of DHPrep, extensive experiments are carried out using four real-world datasets. Experimental results reveal that our DHPrep algorithm outperforms state-of-the-art baseline methods in various tasks, including link prediction and vertex recommendation. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
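<p class="is-size-7">The Hawkes-process view of edge formation has a compact generic form: the intensity for a vertex pair is a base rate plus exponentially decaying excitation from their past interactions. The sketch below shows only that standard formula with arbitrary parameter values; in DHPrep these quantities are derived from learned vertex representations.</p>
<pre><code class="language-python">
# Generic Hawkes intensity for edge formation, lambda(t) = mu + sum over past
# events of alpha * exp(-delta * (t - t_i)). Parameters here are arbitrary.
import numpy as np

def hawkes_intensity(t, event_times, mu=0.05, alpha=0.3, delta=1.0):
    past = np.asarray([ti for ti in event_times if t > ti])
    return mu + alpha * np.exp(-delta * (t - past)).sum()

history = [0.5, 1.2, 3.0]               # past interaction times of a vertex pair
print(hawkes_intensity(3.1, history))   # recent event at t=3.0 dominates
print(hawkes_intensity(9.0, history))   # excitation has decayed back toward mu
</code></pre>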
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20627v1-abstract-full').style.display = 'none'; document.getElementById('2410.20627v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20599">arXiv:2410.20599</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20599">pdf</a>, <a href="https://arxiv.org/format/2410.20599">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICST59744.2023.10460820">10.1109/ICST59744.2023.10460820 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Sensor Fusion for Autonomous Indoor UAV Navigation in Confined Spaces </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=James%2C+A">Alice James</a>, <a href="/search/cs?searchtype=author&amp;query=Seth%2C+A">Avishkar Seth</a>, <a href="/search/cs?searchtype=author&amp;query=Kuantama%2C+E">Endrowednes Kuantama</a>, <a href="/search/cs?searchtype=author&amp;query=Mukhopadhyay%2C+S">Subhas Mukhopadhyay</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Richard Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20599v1-abstract-short" style="display: inline;"> In this paper, we address the challenge of navigating through unknown indoor environments using autonomous aerial robots within confined spaces. The core of our system involves the integration of key sensor technologies, including depth sensing from the ZED 2i camera, IMU data, and LiDAR measurements, facilitated by the Robot Operating System (ROS) and RTAB-Map. Through custom designed experiments&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20599v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20599v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20599v1-abstract-full" style="display: none;"> In this paper, we address the challenge of navigating through unknown indoor environments using autonomous aerial robots within confined spaces. The core of our system involves the integration of key sensor technologies, including depth sensing from the ZED 2i camera, IMU data, and LiDAR measurements, facilitated by the Robot Operating System (ROS) and RTAB-Map. Through custom designed experiments, we demonstrate the robustness and effectiveness of this approach. Our results showcase a promising navigation accuracy, with errors as low as 0.4 meters, and mapping quality characterized by a Root Mean Square Error (RMSE) of just 0.13 m. 
Notably, this performance is achieved while maintaining energy efficiency and balanced resource allocation, addressing a crucial concern in UAV applications. Flight tests further underscore the precision of our system in maintaining desired flight orientations, with a remarkable error rate of only 0.1%. This work represents a significant stride in the development of autonomous indoor UAV navigation systems, with potential applications in search and rescue, facility inspection, and environmental monitoring within GPS-denied indoor environments. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 8 figures</span> </p> </li>
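<p class="is-size-7">The paper's stack fuses depth, IMU, and LiDAR through ROS and RTAB-Map. As a self-contained illustration of the general sensor-fusion principle only (not the system above), the sketch below blends a drifting gyro integral with a noisy accelerometer tilt estimate via a complementary filter.</p>
<pre><code class="language-python">
# Illustrative IMU fusion only: a complementary filter blending gyro-integrated
# angle (smooth but drifting) with accelerometer tilt (noisy but unbiased).
import numpy as np

def complementary_filter(gyro_rate, accel_angle, angle_prev, dt=0.01, k=0.98):
    # k weights the gyro-integrated angle; (1 - k) trusts the accelerometer
    return k * (angle_prev + gyro_rate * dt) + (1 - k) * accel_angle

angle = 0.0
for gyro, accel in [(0.2, 0.01), (0.25, 0.012), (0.22, 0.015)]:  # fake samples
    angle = complementary_filter(gyro, accel, angle)
print(angle)   # fused pitch estimate in radians
</code></pre>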
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20584">arXiv:2410.20584</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.20584">pdf</a>, <a href="https://arxiv.org/format/2410.20584">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/ICST59744.2023.10460847">10.1109/ICST59744.2023.10460847 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Aerodynamics and Sensing Analysis for Efficient Drone-Based Parcel Delivery </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Seth%2C+A">Avishkar Seth</a>, <a href="/search/cs?searchtype=author&amp;query=James%2C+A">Alice James</a>, <a href="/search/cs?searchtype=author&amp;query=Kuantama%2C+E">Endrowednes Kuantama</a>, <a href="/search/cs?searchtype=author&amp;query=Mukhopadhyay%2C+S">Subhas Mukhopadhyay</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Richard Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.20584v1-abstract-full"> In an era of rapid urbanization and e-commerce growth, efficient parcel delivery methods are crucial. This paper presents a detailed study of the aerodynamics and sensing analysis of drones for parcel delivery. Utilizing Computational Fluid Dynamics (CFD), the study offers a comprehensive airflow analysis, revealing the aerodynamic forces affecting drone stability due to payload capacity. A multidisciplinary approach is employed, integrating mechanical design, control theory, and sensing systems to address the complex issue of parcel positioning. The experimental validation section rigorously tests payloads of different sizes and positions and their impact on drones with maximum thrusts of 2000 gf. The findings prove the drone&#39;s capacity to lift a large payload that covers up to 50 percent of the propeller, thereby contributing to optimizing drone designs and sustainable parcel delivery systems. It has been observed that the drone can lift a large payload smoothly when it is placed above the drone, with an error rate as low as 0.1 percent for roll, pitch, and yaw. This work paves the way for more versatile, real-world applications of drone technology, setting a new standard in the field. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, 9 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19176">arXiv:2410.19176</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.19176">pdf</a>, <a href="https://arxiv.org/format/2410.19176">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Perturbation-based Graph Active Learning for Weakly-Supervised Belief Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+D">Dachun Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+R">Ruijie Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+J">Jinning Li</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruipeng Han</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xinyi Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Lyu%2C+Y">You Lyu</a>, <a href="/search/cs?searchtype=author&amp;query=Abdelzaher%2C+T">Tarek Abdelzaher</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19176v1-abstract-short" style="display: inline;"> This paper addresses the problem of optimizing the allocation of labeling resources for semi-supervised belief representation learning in social networks. The objective is to strategically identify valuable messages on social media graphs that are worth labeling within a constrained budget, ultimately maximizing the task&#39;s performance. Despite the progress in unsupervised or semi-supervised method&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19176v1-abstract-full').style.display = 'inline'; document.getElementById('2410.19176v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19176v1-abstract-full" style="display: none;"> This paper addresses the problem of optimizing the allocation of labeling resources for semi-supervised belief representation learning in social networks. The objective is to strategically identify valuable messages on social media graphs that are worth labeling within a constrained budget, ultimately maximizing the task&#39;s performance. Despite the progress in unsupervised or semi-supervised methods in advancing belief and ideology representation learning on social networks and the remarkable efficacy of graph learning techniques, the availability of high-quality curated labeled social data can greatly benefit and further improve performances. Consequently, allocating labeling efforts is a critical research problem in scenarios where labeling resources are limited. This paper proposes a graph data augmentation-inspired perturbation-based active learning strategy (PerbALGraph) that progressively selects messages for labeling according to an automatic estimator, obviating human guidance. 
This estimator is based on the principle that messages in the network exhibiting heightened sensitivity to the structural features of the observational data indicate landmark quality that significantly influences the semi-supervision process. We design the estimator to be the prediction variance under a set of designed graph perturbations, which is model-agnostic and application-independent. Extensive experimental results demonstrate the effectiveness of the proposed strategy for belief representation learning tasks. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
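<p class="is-size-7">The acquisition rule lends itself to a compact sketch: score each node by the variance of model predictions across randomly perturbed graphs, then label the highest-variance nodes. The one-layer stand-in "model" and uniform edge-drop rate below are assumptions; PerbALGraph's perturbation set is designed rather than uniform.</p>
<pre><code class="language-python">
# Sketch of the perturbation-variance acquisition idea (assumed simplified
# form): per-node prediction variance across randomly edge-dropped graphs.
import numpy as np

rng = np.random.default_rng(0)

def predict(adj, feats):
    # stand-in "model": one propagation step plus squash; any GNN fits here
    return np.tanh(adj @ feats).mean(axis=1)

def perturbation_scores(adj, feats, n_perturb=8, drop=0.1):
    preds = []
    for _ in range(n_perturb):
        mask = rng.random(adj.shape) > drop      # randomly drop ~10% of edges
        preds.append(predict(adj * mask, feats))
    return np.var(np.stack(preds), axis=0)       # per-node prediction variance

adj = (rng.random((20, 20)) > 0.8).astype(float)
feats = rng.normal(size=(20, 4))
scores = perturbation_scores(adj, feats)
print(np.argsort(-scores)[:3])                   # query budget of 3 nodes
</code></pre>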
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17534">arXiv:2410.17534</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.17534">pdf</a>, <a href="https://arxiv.org/format/2410.17534">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> OVT-B: A New Large-Scale Benchmark for Open-Vocabulary Multi-Object Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Liang%2C+H">Haiji Liang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruize Han</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.17534v1-abstract-full"> Open-vocabulary object perception has become an important topic in artificial intelligence, which aims to identify objects with novel classes that have not been seen during training. Under this setting, open-vocabulary object detection (OVD) in a single image has been widely studied in the literature. However, open-vocabulary object tracking (OVT) from a video has been studied less, and one reason is the shortage of benchmarks. In this work, we have built a new large-scale benchmark for open-vocabulary multi-object tracking, namely OVT-B. OVT-B contains 1,048 categories of objects and 1,973 videos with 637,608 bounding box annotations, which is much larger than the sole existing open-vocabulary tracking dataset, i.e., the OVTAO-val dataset (200+ categories, 900+ videos). The proposed OVT-B can be used as a new benchmark to pave the way for OVT research. We also develop a simple yet effective baseline method for OVT. It integrates motion features for object tracking, which is an important cue for MOT but was ignored in previous OVT methods. Experimental results have verified the usefulness of the proposed benchmark and the effectiveness of our method. We have released the benchmark to the public at https://github.com/Coo1Sea/OVT-B-Dataset. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">15 pages, 6 figures, accepted at NeurIPS 2024 Dataset and Benchmark Track</span> </p> </li>
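<p class="is-size-7">To illustrate why motion helps association, a toy example: propagate a track's box by its velocity, then match detections by IoU against the prediction. This is a generic illustration of motion-aware matching, not the OVT-B baseline's actual integration of motion features.</p>
<pre><code class="language-python">
# Toy sketch of motion-aware association: velocity-propagated box vs. IoU.
import numpy as np

def iou(a, b):
    # boxes as [x1, y1, x2, y2]
    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
    x2, y2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    union = (a[2]-a[0])*(a[3]-a[1]) + (b[2]-b[0])*(b[3]-b[1]) - inter
    return inter / union if union > 0 else 0.0

track_box = np.array([10., 10., 30., 30.])
velocity = np.array([5., 0., 5., 0.])           # per-frame box displacement
predicted = track_box + velocity                # motion-propagated box
detections = [np.array([14., 10., 34., 30.]), np.array([60., 60., 80., 80.])]
scores = [iou(predicted, d) for d in detections]
print(int(np.argmax(scores)), max(scores))      # detection 0 matches the track
</code></pre>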
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.12694">arXiv:2410.12694</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.12694">pdf</a>, <a href="https://arxiv.org/format/2410.12694">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> VividMed: Vision Language Model with Versatile Visual Grounding for Medicine </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Luo%2C+L">Lingxiao Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+B">Bingda Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+X">Xuanzhong Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Rong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Ting Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.12694v1-abstract-full"> Recent advancements in Vision Language Models (VLMs) have demonstrated remarkable promise in generating visually grounded responses. However, their application in the medical domain is hindered by unique challenges. For instance, most VLMs rely on a single method of visual grounding, whereas complex medical tasks demand more versatile approaches. Additionally, while most VLMs process only 2D images, a large portion of medical images are 3D. The lack of medical data further compounds these obstacles. To address these challenges, we present VividMed, a vision language model with versatile visual grounding for medicine. Our model supports generating both semantic segmentation masks and instance-level bounding boxes, and accommodates various imaging modalities, including both 2D and 3D data. We design a three-stage training procedure and an automatic data synthesis pipeline based on open datasets and models. Besides visual grounding tasks, VividMed also excels in other common downstream tasks, including Visual Question Answering (VQA) and report generation. Ablation studies empirically show that the integration of visual grounding ability leads to improved performance on these tasks. Our code is publicly available at https://github.com/function2-llx/MMMM. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
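<p class="is-size-7">A small, purely illustrative utility relates the two grounding forms the model supports: an instance bounding box can always be read off a segmentation mask. VividMed's grounding heads are learned; this is only a geometric sketch.</p>
<pre><code class="language-python">
# Geometric sketch: derive an instance bounding box from a segmentation mask.
import numpy as np

def mask_to_box(mask):
    # mask: 2D boolean array; returns [x1, y1, x2, y2] or None if empty
    ys, xs = np.nonzero(mask)
    if ys.size == 0:
        return None
    return [int(xs.min()), int(ys.min()), int(xs.max()), int(ys.max())]

mask = np.zeros((64, 64), dtype=bool)
mask[20:40, 10:30] = True               # fake lesion segmentation
print(mask_to_box(mask))                # [10, 20, 29, 39]
</code></pre>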
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.11325">arXiv:2410.11325</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.11325">pdf</a>, <a href="https://arxiv.org/format/2410.11325">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Speculative Knowledge Distillation: Bridging the Teacher-Student Gap Through Interleaved Sampling </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xu%2C+W">Wenda Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Rujun Han</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zifeng Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Le%2C+L+T">Long T. Le</a>, <a href="/search/cs?searchtype=author&amp;query=Madeka%2C+D">Dhruv Madeka</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+L">Lei Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W+Y">William Yang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Agarwal%2C+R">Rishabh Agarwal</a>, <a href="/search/cs?searchtype=author&amp;query=Lee%2C+C">Chen-Yu Lee</a>, <a href="/search/cs?searchtype=author&amp;query=Pfister%2C+T">Tomas Pfister</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.11325v1-abstract-full"> Recent advances in knowledge distillation (KD) have enabled smaller student models to approach the performance of larger teacher models. However, popular methods such as supervised KD and on-policy KD are adversely impacted by the knowledge gaps between teacher and student in practical scenarios. Supervised KD suffers from a distribution mismatch between training with a static dataset and inference over final student-generated outputs. Conversely, on-policy KD, which uses student-generated samples for training, can suffer from low-quality training examples with which teacher models are not familiar, resulting in inaccurate teacher feedback. To address these limitations, we introduce Speculative Knowledge Distillation (SKD), a novel approach that leverages cooperation between student and teacher models to generate high-quality training data on-the-fly while aligning with the student&#39;s inference-time distribution. In SKD, the student proposes tokens, and the teacher replaces poorly ranked ones based on its own distribution, transferring high-quality knowledge adaptively. We evaluate SKD on various text generation tasks, including translation, summarization, math, and instruction following, and show that SKD consistently outperforms existing KD methods across different domains, data sizes, and model initialization strategies. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
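<p class="is-size-7">The abstract's sampling rule admits a direct toy rendering: the student proposes a token, and if the proposal ranks poorly under the teacher's distribution, a teacher sample replaces it. The vocabulary, distributions, and top-k cutoff below are illustrative assumptions, not the paper's configuration.</p>
<pre><code class="language-python">
# Toy rendering of the speculative-KD token rule described in the abstract.
import numpy as np

rng = np.random.default_rng(0)

def skd_step(student_probs, teacher_probs, top_k=3):
    proposal = rng.choice(len(student_probs), p=student_probs)
    teacher_rank = int(np.argsort(-teacher_probs).tolist().index(proposal))
    if teacher_rank >= top_k:                     # poorly ranked by the teacher
        proposal = rng.choice(len(teacher_probs), p=teacher_probs)
    return proposal

student = np.array([0.05, 0.05, 0.10, 0.80])      # student favors token 3
teacher = np.array([0.40, 0.30, 0.20, 0.10])      # teacher disagrees
print([skd_step(student, teacher) for _ in range(5)])
</code></pre>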
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.10646">arXiv:2410.10646</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.10646">pdf</a>, <a href="https://arxiv.org/format/2410.10646">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> DR-MPC: Deep Residual Model Predictive Control for Real-world Social Navigation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+J+R">James R. Han</a>, <a href="/search/cs?searchtype=author&amp;query=Thomas%2C+H">Hugues Thomas</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+J">Jian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Rhinehart%2C+N">Nicholas Rhinehart</a>, <a href="/search/cs?searchtype=author&amp;query=Barfoot%2C+T+D">Timothy D. Barfoot</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2410.10646v2-abstract-full"> How can a robot safely navigate around people with complex motion patterns? Deep Reinforcement Learning (DRL) in simulation holds some promise, but much prior work relies on simulators that fail to capture the nuances of real human motion. Thus, we propose Deep Residual Model Predictive Control (DR-MPC) to enable robots to quickly and safely perform DRL from real-world crowd navigation data. By blending MPC with model-free DRL, DR-MPC overcomes the DRL challenges of large data requirements and unsafe initial behavior. DR-MPC is initialized with MPC-based path tracking and gradually learns to interact more effectively with humans. To further accelerate learning, a safety component estimates out-of-distribution states to guide the robot away from likely collisions. In simulation, we show that DR-MPC substantially outperforms prior work, including traditional DRL and residual DRL models. Hardware experiments show that our approach successfully enables a robot to navigate a variety of crowded situations with few errors using less than 4 hours of training data. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 February, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 14 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 8 figures, accepted to IEEE Robotics and Automation Letters (RA-L) February 2025</span> </p> </li>
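<p class="is-size-7">A conceptual reduction of the blend described above: the executed action is the MPC path-tracking action plus a learned residual, with an out-of-distribution gate falling back to pure MPC. The numbers and the hard on/off gate are assumptions; the paper's safety component is learned.</p>
<pre><code class="language-python">
# Conceptual sketch (assumed reduction of DR-MPC): MPC action plus a learned
# residual, gated off when the state looks out-of-distribution.
import numpy as np

def dr_mpc_action(mpc_action, residual, ood_score, ood_max=0.5):
    # trust the learned residual only when the state looks in-distribution
    gate = 0.0 if ood_score > ood_max else 1.0
    return mpc_action + gate * residual

mpc_a = np.array([0.8, 0.0])        # [linear vel, angular vel] from MPC
res_a = np.array([-0.2, 0.3])       # DRL residual nudging around a person
print(dr_mpc_action(mpc_a, res_a, ood_score=0.1))   # blended action
print(dr_mpc_action(mpc_a, res_a, ood_score=0.9))   # OOD: pure MPC fallback
</code></pre>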
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 8 figures, accepted to IEEE Robotics and Automation Letters (RA-L) February 2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.08529">arXiv:2410.08529</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2410.08529">pdf</a>, <a href="https://arxiv.org/format/2410.08529">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> VOVTrack: Exploring the Potentiality in Videos for Open-Vocabulary Object Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qian%2C+Z">Zekun Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruize Han</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+J">Junhui Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+L">Linqi Song</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+W">Wei Feng</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.08529v1-abstract-short" style="display: inline;"> Open-vocabulary multi-object tracking (OVMOT) represents a critical new challenge involving the detection and tracking of diverse object categories in videos, encompassing both seen categories (base classes) and unseen categories (novel classes). This issue amalgamates the complexities of open-vocabulary object detection (OVD) and multi-object tracking (MOT). Existing approaches to OVMOT often mer&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.08529v1-abstract-full').style.display = 'inline'; document.getElementById('2410.08529v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.08529v1-abstract-full" style="display: none;"> Open-vocabulary multi-object tracking (OVMOT) represents a critical new challenge involving the detection and tracking of diverse object categories in videos, encompassing both seen categories (base classes) and unseen categories (novel classes). This issue amalgamates the complexities of open-vocabulary object detection (OVD) and multi-object tracking (MOT). Existing approaches to OVMOT often merge OVD and MOT methodologies as separate modules, predominantly focusing on the problem through an image-centric lens. In this paper, we propose VOVTrack, a novel method that integrates object states relevant to MOT and video-centric training to address this challenge from a video object tracking standpoint. First, we consider the tracking-related state of the objects during tracking and propose a new prompt-guided attention mechanism for more accurate localization and classification (detection) of the time-varying objects. Subsequently, we leverage raw video data without annotations for training by formulating a self-supervised object similarity learning technique to facilitate temporal object association (tracking). 
Experimental results underscore that VOVTrack outperforms existing methods, establishing itself as a state-of-the-art solution for the open-vocabulary tracking task. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li>
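<p class="is-size-7">Self-supervised similarity learning ultimately feeds a matching step. The sketch below shows the generic association mechanics with random stand-in embeddings: cosine similarities between consecutive-frame features, matched with the Hungarian algorithm. It illustrates the mechanism only, not VOVTrack's learned similarity.</p>
<pre><code class="language-python">
# Generic appearance association: cosine similarity plus Hungarian matching.
import numpy as np
from scipy.optimize import linear_sum_assignment

rng = np.random.default_rng(0)
feats_t = rng.normal(size=(3, 32))        # 3 tracked objects at frame t
feats_t1 = feats_t[[2, 0, 1]] + 0.05 * rng.normal(size=(3, 32))  # permuted at t+1

def normalize(x):
    return x / np.linalg.norm(x, axis=1, keepdims=True)

sim = normalize(feats_t) @ normalize(feats_t1).T   # cosine similarity matrix
rows, cols = linear_sum_assignment(-sim)           # maximize total similarity
print(list(zip(rows.tolist(), cols.tolist())))     # recovers the permutation
</code></pre>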
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.03773">arXiv:2409.03773</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2409.03773">pdf</a>, <a href="https://arxiv.org/format/2409.03773">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> CoPRA: Bridging Cross-domain Pretrained Sequence Models with Complex Structures for Protein-RNA Binding Affinity Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Rong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+X">Xiaohong Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+T">Tong Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+J">Jing Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+X">Xiaoyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Lan%2C+W">Wuyang Lan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhenyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zixuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+J">Jiangning Song</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+G">Guangyu Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Ting Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.03773v2-abstract-full"> Accurately measuring protein-RNA binding affinity is crucial in many biological processes and in drug design. Previous computational methods for protein-RNA binding affinity prediction rely on either sequence or structure features and are thus unable to capture the binding mechanisms comprehensively. Recently emerging pre-trained language models, trained on massive unsupervised sequences of protein and RNA, have shown strong representation ability for various in-domain downstream tasks, including binding site prediction. However, applying language models from different domains collaboratively to complex-level tasks remains unexplored. In this paper, we propose CoPRA to bridge pre-trained language models from different biological domains via Complex structure for Protein-RNA binding Affinity prediction. We demonstrate for the first time that cross-modal biological language models can collaborate to improve binding affinity prediction. We propose a Co-Former to combine the cross-modal sequence and structure information and a bi-scope pre-training strategy to improve Co-Former&#39;s interaction understanding. Meanwhile, we build the largest protein-RNA binding affinity dataset, PRA310, for performance evaluation. We also test our model on a public dataset for mutation effect prediction. CoPRA reaches state-of-the-art performance on all the datasets. We provide extensive analyses and verify that CoPRA can (1) accurately predict the protein-RNA binding affinity; (2) understand the binding affinity change caused by mutations; and (3) benefit from scaling data and model size. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 January, 2025; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024. </p> </li>
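<p class="is-size-7">At its simplest, cross-domain fusion means combining a protein-LM embedding, an RNA-LM embedding, and structure-derived features before a prediction head. The sketch below is only that skeleton, with random vectors, an assumed contact feature, and an untrained linear head; the Co-Former's attention-based fusion is far richer.</p>
<pre><code class="language-python">
# Skeleton of cross-domain embedding fusion; all names and dims are assumptions.
import numpy as np

rng = np.random.default_rng(0)
protein_emb = rng.normal(size=128)     # e.g. pooled protein language model state
rna_emb = rng.normal(size=128)         # e.g. pooled RNA language model state
contact_frac = np.array([0.12])        # structure feature: interface contact fraction

x = np.concatenate([protein_emb, rna_emb, contact_frac])
w = rng.normal(size=x.size) * 0.01     # untrained linear head, for shape only
print(float(w @ x))                    # would regress binding affinity after training
</code></pre>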
<li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2409.00369">arXiv:2409.00369</a> <span>&nbsp;&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> An Empirical Study on Information Extraction using Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ridong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Yang%2C+C">Chaohao Yang</a>, <a href="/search/cs?searchtype=author&amp;query=Peng%2C+T">Tao Peng</a>, <a href="/search/cs?searchtype=author&amp;query=Tiwari%2C+P">Prayag Tiwari</a>, <a href="/search/cs?searchtype=author&amp;query=Wan%2C+X">Xiang Wan</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+L">Lu Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+B">Benyou Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-full has-text-grey-dark mathjax" id="2409.00369v3-abstract-full"> Human-like large language models (LLMs), especially the most powerful and popular members of OpenAI&#39;s GPT family, have proven to be very helpful for many natural language processing (NLP) tasks. Therefore, various attempts have been made to apply LLMs to information extraction (IE), a fundamental NLP task that involves extracting information from unstructured plain text. To demonstrate the latest representative progress in LLMs&#39; information extraction ability, we assess the information extraction ability of GPT-4 (the latest version of GPT at the time of writing this paper) from four perspectives: performance, evaluation criteria, robustness, and error types. Our results suggest a visible performance gap between GPT-4 and state-of-the-art (SOTA) IE methods. To alleviate this problem, and considering LLMs&#39; human-like characteristics, we propose and analyze the effects of a series of simple prompt-based methods, which can be generalized to other LLMs and NLP tasks. Extensive experiments show our methods&#39; effectiveness and some of their remaining issues in improving GPT-4&#39;s information extraction ability. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> September 2024.
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This submission was intended instead as the replacement of arXiv:2305.14450 , where it now appears as arXiv:2305.14450v2</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2408.11745">arXiv:2408.11745</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2408.11745">pdf</a>, <a href="https://arxiv.org/format/2408.11745">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> FocusLLM: Precise Understanding of Long Context by Dynamic Condensing </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Li%2C+Z">Zhenyu Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yike Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Pan%2C+T">Tengyu Pan</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yutao Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Duan%2C+Z">Zhichao Duan</a>, <a href="/search/cs?searchtype=author&amp;query=Fang%2C+J">Junjie Fang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Rong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Z">Zixuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jianyong Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2408.11745v2-abstract-short" style="display: inline;"> Empowering LLMs with the ability to precisely understand long contexts is crucial for many downstream applications. However, handling long contexts with conventional transformer architecture requires substantial training and inference resources. Existing context condensing methods cannot accurately understand the full context, as there is a considerable amount of information loss in the condensing&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.11745v2-abstract-full').style.display = 'inline'; document.getElementById('2408.11745v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2408.11745v2-abstract-full" style="display: none;"> Empowering LLMs with the ability to precisely understand long contexts is crucial for many downstream applications. However, handling long contexts with conventional transformer architecture requires substantial training and inference resources. Existing context condensing methods cannot accurately understand the full context, as there is a considerable amount of information loss in the condensing process. To address these issues, we present FocusLLM, a framework designed to extend the fixed context length of any decoder-only LLM, allowing the model to focus on relevant information from very long sequences. FocusLLM first divides long text input into chunks based on the model&#39;s original context length. It then employs the dynamic condensing process to distill crucial information from each chunk. 
Ultimately, through a novel parallel decoding mechanism, FocusLLM integrates the extracted information into its local context. FocusLLM stands out for its training efficiency and versatility: trained with an 8K input length and at much lower training cost than previous methods, FocusLLM exhibits superior performance across downstream tasks and maintains strong language modeling ability when handling extensive long texts, even up to 400K tokens. Our code is available at https://github.com/leezythu/FocusLLM. </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 December, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 21 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> </li>
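<p class="is-size-7">The chunk-then-condense flow has a simple structural skeleton, sketched below with a trivial stand-in condenser (keep the first tokens of each chunk). The real framework distills learned representations per chunk and decodes chunks in parallel; everything here is a schematic assumption.</p>
<pre><code class="language-python">
# Structural sketch of chunk-then-condense; the condenser is a naive stand-in.
def condense(chunk, keep=2):
    # stand-in condenser: keep the first `keep` tokens of each chunk; the
    # actual system distills learned representations instead
    return chunk[:keep]

def focus_style_context(tokens, ctx_len=8, local_len=6, keep=2):
    chunks = [tokens[i:i + ctx_len] for i in range(0, len(tokens), ctx_len)]
    condensed = [tok for ch in chunks for tok in condense(ch, keep)]
    return condensed + tokens[-local_len:]   # condensed memory + local context

long_doc = [f"tok{i}" for i in range(40)]
print(focus_style_context(long_doc))         # short prompt covering a long input
</code></pre>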
To ensure that the performance of Distil-DCCRN matches that of the DCCRN, we employ knowledge distillation (KD), using a larger teacher model to help train the smaller student model. Specifically, we design a KD method integrating attention transfer and Kullback-Leibler divergence (AT-KL) to train the student model Distil-DCCRN. Additionally, we use a model with better performance and a more complicated structure, Uformer, as the teacher model. Unlike previous KD approaches that mainly focus on model outputs, our method also leverages the intermediate features from the models&#39; middle layers, facilitating rich knowledge transfer across different structured models despite variations in layer configurations and discrepancies in the channel and time dimensions of intermediate features. Employing our AT-KL approach, Distil-DCCRN outperforms DCCRN as well as several other competitive models in both PESQ and SI-SNR metrics on the DNS test set and achieves comparable results to DCCRN in DNSMOS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2408.04267v1-abstract-full').style.display = 'none'; document.getElementById('2408.04267v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> August 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by IEEE Signal Processing Letters</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.21417">arXiv:2407.21417</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.21417">pdf</a>, <a href="https://arxiv.org/format/2407.21417">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Dancing in Chains: Reconciling Instruction Following and Faithfulness in Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Z">Zhengxuan Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuhao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+P">Peng Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yumo Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Rujun Han</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yian Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+J">Jifan Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Min%2C+B">Bonan Min</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+Z">Zhiheng Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.21417v1-abstract-short" style="display: inline;"> Modern language models (LMs) need to follow human instructions while being faithful; yet, they often fail to achieve both.
Here, we provide concrete evidence of a trade-off between instruction following (i.e., following open-ended instructions) and faithfulness (i.e., grounding responses in the given context) when training LMs with these objectives. For instance, fine-tuning LLaMA-7B on instruction followin&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21417v1-abstract-full').style.display = 'inline'; document.getElementById('2407.21417v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.21417v1-abstract-full" style="display: none;"> Modern language models (LMs) need to follow human instructions while being faithful; yet, they often fail to achieve both. Here, we provide concrete evidence of a trade-off between instruction following (i.e., following open-ended instructions) and faithfulness (i.e., grounding responses in the given context) when training LMs with these objectives. For instance, fine-tuning LLaMA-7B on instruction following datasets renders it less faithful. Conversely, instruction-tuned Vicuna-7B shows degraded performance at following instructions when further optimized on tasks that require contextual grounding. One common remedy is multi-task learning (MTL) with data mixing, yet it remains far from achieving a synergistic outcome. We propose a simple yet effective method that relies on Rejection Sampling for Continued Self-instruction Tuning (ReSet), which significantly outperforms vanilla MTL. Surprisingly, we find that less is more, as training ReSet with high-quality, yet substantially smaller data (three-fold less) yields superior results. Our findings offer a better understanding of objective discrepancies in alignment training of LMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.21417v1-abstract-full').style.display = 'none'; document.getElementById('2407.21417v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024.
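</p> <p class="is-size-7 has-text-grey-dark">The rejection-sampling recipe in this abstract lends itself to a compact sketch. The Python below is a hypothetical rendering, not the authors' code: sample several candidate responses per prompt, keep only the best candidate above a quality bar, and continue tuning on the survivors. The names <code>sample_responses</code>, <code>score</code>, and <code>fine_tune</code> are stand-ins.</p> <pre class="is-size-7"><code>
import random

def sample_responses(model, prompt, k=4):
    # stand-in for k stochastic decodes from the current model
    return [f"{prompt} :: draft {i}" for i in range(k)]

def score(response, context):
    # stand-in for a faithfulness / instruction-following scorer
    return random.random()

def fine_tune(model, pairs):
    # stand-in for one continued fine-tuning step on the kept pairs
    return model

def reset_round(model, prompts, contexts, threshold=0.7, k=4):
    kept = []
    for prompt, context in zip(prompts, contexts):
        scored = [(score(r, context), r) for r in sample_responses(model, prompt, k)]
        best_score, best = max(scored)
        if best_score >= threshold:  # reject everything below the bar
            kept.append((prompt, best))
    # "less is more": continue tuning on a small, high-quality set
    return fine_tune(model, kept)
</code></pre> <p class="is-size-7">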
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">preprint</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19727">arXiv:2407.19727</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.19727">pdf</a>, <a href="https://arxiv.org/format/2407.19727">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Adaptive Utilization of Cross-scenario Information for Multi-scenario Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Shu%2C+X">Xiufeng Shu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruidong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+W">Wei Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19727v1-abstract-short" style="display: inline;"> The recommender system of an e-commerce platform usually serves multiple business scenarios. Multi-scenario Recommendation (MSR) is an important topic that improves ranking performance by leveraging information from different scenarios. Recent methods for MSR mostly construct scenario-shared or scenario-specific modules to model commonalities and differences among scenarios. However, when the amount of data a&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19727v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19727v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19727v1-abstract-full" style="display: none;"> The recommender system of an e-commerce platform usually serves multiple business scenarios. Multi-scenario Recommendation (MSR) is an important topic that improves ranking performance by leveraging information from different scenarios. Recent methods for MSR mostly construct scenario-shared or scenario-specific modules to model commonalities and differences among scenarios. However, when the amount of data among scenarios is skewed or data in some scenarios is extremely sparse, it is difficult to learn scenario-specific parameters well. Besides, simple sharing of information from other scenarios may result in negative transfer. In this paper, we propose a unified model named Cross-Scenario Information Interaction (CSII) to serve all scenarios by a mixture of scenario-dominated experts. Specifically, we propose a novel method to select highly transferable features in data instances. Then, we propose an attention-based aggregator module, which can adaptively extract relevant knowledge across scenarios. Experiments on the production dataset verify the superiority of our method. An online A/B test in the Meituan Waimai app also shows a significant performance gain, leading to an average improvement in GMV (Gross Merchandise Value) of 1.0% for overall scenarios.
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19727v1-abstract-full').style.display = 'none'; document.getElementById('2407.19727v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.19658">arXiv:2407.19658</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.19658">pdf</a>, <a href="https://arxiv.org/format/2407.19658">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3627673.3679914">10.1145/3627673.3679914 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Enhancing CTR Prediction through Sequential Recommendation Pre-training: Introducing the SRP4CTR Framework </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruidong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+Q">Qianzhong Li</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+H">He Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+R">Rui Li</a>, <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yurou Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+W">Wei Lin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.19658v1-abstract-short" style="display: inline;"> Understanding user interests is crucial for Click-Through Rate (CTR) prediction tasks. In sequential recommendation, pre-training from user historical behaviors through self-supervised learning can better comprehend user dynamic preferences, presenting the potential for direct integration with CTR tasks. Previous methods have integrated pre-trained models into downstream tasks with the sole purpos&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19658v1-abstract-full').style.display = 'inline'; document.getElementById('2407.19658v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.19658v1-abstract-full" style="display: none;"> Understanding user interests is crucial for Click-Through Rate (CTR) prediction tasks. In sequential recommendation, pre-training from user historical behaviors through self-supervised learning can better comprehend user dynamic preferences, presenting the potential for direct integration with CTR tasks. Previous methods have integrated pre-trained models into downstream tasks with the sole purpose of extracting semantic information or well-represented user features, which are then incorporated as new features. 
However, these approaches tend to ignore the additional inference costs imposed on the downstream tasks, and they do not consider how to transfer the effective information from the pre-trained models for specific estimated items in CTR prediction. In this paper, we propose a Sequential Recommendation Pre-training framework for CTR prediction (SRP4CTR) to tackle the above problems. Initially, we discuss the impact of introducing pre-trained models on inference costs. Subsequently, we introduce a pre-training method to encode sequence side information concurrently. During the fine-tuning process, we incorporate a cross-attention block to establish a bridge between estimated items and the pre-trained model at a low cost. Moreover, we develop a querying transformer technique to facilitate the knowledge transfer from the pre-trained model to industrial CTR models. Offline and online experiments show that our method outperforms previous baseline models. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.19658v1-abstract-full').style.display = 'none'; document.getElementById('2407.19658v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.17535">arXiv:2407.17535</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.17535">pdf</a>, <a href="https://arxiv.org/format/2407.17535">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Software Engineering">cs.SE</span> </div> </div> <p class="title is-5 mathjax"> LAMBDA: A Large Model Based Data Agent </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Sun%2C+M">Maojun Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruijian Han</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+B">Binyan Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+H">Houduo Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+D">Defeng Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Yuan%2C+Y">Yancheng Yuan</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jian Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.17535v2-abstract-short" style="display: inline;"> We introduce LArge Model Based Data Agent (LAMBDA), a novel open-source, code-free multi-agent data analysis system that leverages the power of large models. LAMBDA is designed to address data analysis challenges in complex data-driven applications through innovatively designed data agents that operate iteratively and generatively using natural language.
At the core of LAMBDA are two key agent rol&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17535v2-abstract-full').style.display = 'inline'; document.getElementById('2407.17535v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.17535v2-abstract-full" style="display: none;"> We introduce LArge Model Based Data Agent (LAMBDA), a novel open-source, code-free multi-agent data analysis system that leverages the power of large models. LAMBDA is designed to address data analysis challenges in complex data-driven applications through innovatively designed data agents that operate iteratively and generatively using natural language. At the core of LAMBDA are two key agent roles: the programmer and the inspector, which are engineered to work together seamlessly. Specifically, the programmer generates code based on the user&#39;s instructions and domain-specific knowledge, enhanced by advanced models. Meanwhile, the inspector debugs the code when necessary. To ensure robustness and handle adverse scenarios, LAMBDA features a user interface that allows direct user intervention in the operational loop. Additionally, LAMBDA can flexibly integrate external models and algorithms through our proposed Knowledge Integration Mechanism, catering to the needs of customized data analysis. LAMBDA has demonstrated strong performance on various data analysis tasks. It has the potential to enhance data analysis paradigms by seamlessly integrating human and artificial intelligence, making it more accessible, effective, and efficient for users from diverse backgrounds. The strong performance of LAMBDA in solving data analysis problems is demonstrated using real-world data examples. Videos of several case studies are available at https://xxxlambda.github.io/lambda_webpage. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.17535v2-abstract-full').style.display = 'none'; document.getElementById('2407.17535v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 September, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
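</p> <p class="is-size-7 has-text-grey-dark">A minimal sketch of the programmer/inspector loop this abstract describes: one role generates code for the user's request, the other executes it and feeds errors back until the code runs. Both functions below are hypothetical stand-ins, not the project's actual API.</p> <pre class="is-size-7"><code>
def programmer(instruction, error=None):
    # stand-in for an LLM call that (re)generates analysis code
    fix = f"  # revised after: {error}" if error else ""
    return f"result = len('{instruction}'){fix}"

def inspector(code):
    # execute the candidate code; return the error message if it fails
    try:
        exec(code, {})
        return None
    except Exception as exc:
        return str(exc)

def lambda_loop(instruction, max_rounds=3):
    error = None
    for _ in range(max_rounds):
        code = programmer(instruction, error)
        error = inspector(code)
        if error is None:
            return code  # code ran cleanly
    raise RuntimeError("inspector could not repair the code")

print(lambda_loop("summarize sales by region"))
</code></pre> <p class="is-size-7">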
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">51 pages, 23 figures and 6 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 62-04; 62-08; 68T01; 68T09 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.14330">arXiv:2407.14330</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.14330">pdf</a>, <a href="https://arxiv.org/format/2407.14330">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Straightforward Layer-wise Pruning for More Efficient Visual Adaptation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruizi Han</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+J">Jinglei Tang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.14330v1-abstract-short" style="display: inline;"> Parameter-efficient transfer learning (PETL) aims to adapt large pre-trained models using limited parameters. While most PETL approaches update the added parameters and freeze pre-trained weights during training, the minimal impact of task-specific deep layers on cross-domain data poses a challenge as PETL cannot modify them, resulting in redundant model structures. Structural pruning effectively&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.14330v1-abstract-full').style.display = 'inline'; document.getElementById('2407.14330v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.14330v1-abstract-full" style="display: none;"> Parameter-efficient transfer learning (PETL) aims to adapt large pre-trained models using limited parameters. While most PETL approaches update the added parameters and freeze pre-trained weights during training, the minimal impact of task-specific deep layers on cross-domain data poses a challenge as PETL cannot modify them, resulting in redundant model structures. Structural pruning effectively reduces model redundancy; however, common pruning methods often lead to an excessive increase in stored parameters due to varying pruning structures based on pruning rates and data. Recognizing this stored-parameter volume issue, we propose a Straightforward layer-wise pruning method, called SLS, for pruning PETL-transferred models. By evaluating each layer&#39;s parameters from a feature perspective and assessing them with clustering metrics computed in the low-dimensional space obtained through t-SNE, SLS facilitates informed pruning decisions. Our study reveals that layer-wise pruning, with a focus on storing pruning indices, addresses storage volume concerns. Notably, mainstream layer-wise pruning methods may not be suitable for assessing layer importance in PETL-transferred models, where the majority of parameters are pre-trained and have limited relevance to downstream datasets.
Comparative analysis against state-of-the-art PETL methods demonstrates that the pruned model achieves a notable balance between model throughput and accuracy. Moreover, SLS effectively reduces storage overhead arising from varying pruned structures while enhancing the accuracy and speed of pruned models compared to conventional pruning methods. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.14330v1-abstract-full').style.display = 'none'; document.getElementById('2407.14330v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted to ECCV 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.14047">arXiv:2407.14047</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.14047">pdf</a>, <a href="https://arxiv.org/format/2407.14047">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> OCTrack: Benchmarking the Open-Corpus Multi-Object Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Qian%2C+Z">Zekun Qian</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruize Han</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+W">Wei Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Hou%2C+J">Junhui Hou</a>, <a href="/search/cs?searchtype=author&amp;query=Song%2C+L">Linqi Song</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+S">Song Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.14047v1-abstract-short" style="display: inline;"> We study a novel yet practical problem of open-corpus multi-object tracking (OCMOT), which extends MOT to localizing, associating, and recognizing generic-category objects of both seen (base) and unseen (novel) classes, but without the category text list as prompt. To study this problem, the top priority is to build a benchmark. In this work, we build OCTrackB, a large-scale and comprehensiv&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.14047v1-abstract-full').style.display = 'inline'; document.getElementById('2407.14047v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.14047v1-abstract-full" style="display: none;"> We study a novel yet practical problem of open-corpus multi-object tracking (OCMOT), which extends MOT to localizing, associating, and recognizing generic-category objects of both seen (base) and unseen (novel) classes, but without the category text list as prompt. To study this problem, the top priority is to build a benchmark.
In this work, we build OCTrackB, a large-scale and comprehensive benchmark, to provide a standard evaluation platform for the OCMOT problem. Compared to previous datasets, OCTrackB has more abundant and balanced base/novel classes and the corresponding samples for evaluation with less bias. We also propose a new multi-granularity recognition metric to better evaluate the generative object recognition in OCMOT. By conducting the extensive benchmark evaluation, we report and analyze the results of various state-of-the-art methods, which demonstrate the rationale of OCMOT, as well as the usefulness and advantages of OCTrackB. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.14047v1-abstract-full').style.display = 'none'; document.getElementById('2407.14047v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13998">arXiv:2407.13998</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.13998">pdf</a>, <a href="https://arxiv.org/format/2407.13998">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RAG-QA Arena: Evaluating Domain Robustness for Long-form Retrieval Augmented Question Answering </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Rujun Han</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuhao Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Qi%2C+P">Peng Qi</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yumo Xu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+J">Jenyuan Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+L">Lan Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+W+Y">William Yang Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Min%2C+B">Bonan Min</a>, <a href="/search/cs?searchtype=author&amp;query=Castelli%2C+V">Vittorio Castelli</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13998v2-abstract-short" style="display: inline;"> Question answering based on retrieval augmented generation (RAG-QA) is an important research topic in NLP and has a wide range of real-world applications. 
However, most existing datasets for this task are either constructed using a single source corpus or consist of short extractive answers, which fall short of evaluating large language model (LLM) based RAG-QA systems on cross-domain generalizati&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13998v2-abstract-full').style.display = 'inline'; document.getElementById('2407.13998v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13998v2-abstract-full" style="display: none;"> Question answering based on retrieval augmented generation (RAG-QA) is an important research topic in NLP and has a wide range of real-world applications. However, most existing datasets for this task are either constructed using a single source corpus or consist of short extractive answers, which fall short of evaluating large language model (LLM) based RAG-QA systems on cross-domain generalization. To address these limitations, we create Long-form RobustQA (LFRQA), a new dataset comprising human-written long-form answers that integrate short extractive answers from multiple documents into a single, coherent narrative, covering 26K queries and large corpora across seven different domains. We further propose RAG-QA Arena by directly comparing model-generated answers against LFRQA&#39;s answers using LLMs as evaluators. We show via extensive experiments that RAG-QA Arena and human judgments on answer quality are highly correlated. Moreover, only 41.3% of the most competitive LLM&#39;s answers are preferred to LFRQA&#39;s answers, demonstrating RAG-QA Arena as a challenging evaluation platform for future research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13998v2-abstract-full').style.display = 'none'; document.getElementById('2407.13998v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. 
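</p> <p class="is-size-7 has-text-grey-dark">A toy sketch of the arena-style evaluation this abstract describes: an LLM judge compares a model-generated answer against the human-written LFRQA answer, and the reported metric is the win rate. <code>llm_judge</code> is a hypothetical stand-in for a real judge-model call, not the paper's code.</p> <pre class="is-size-7"><code>
def llm_judge(question, answer_a, answer_b):
    # stand-in for a judge-model call; a real arena would also randomize
    # the A/B order to control for position bias
    return "A" if len(answer_a) >= len(answer_b) else "B"

def win_rate(examples):
    # fraction of queries where the model answer beats the LFRQA reference
    wins = sum(
        llm_judge(q, model_ans, ref_ans) == "A"
        for q, model_ans, ref_ans in examples
    )
    return wins / len(examples)

examples = [("What causes tides?",
             "Mostly the Moon's gravity, with a smaller push from the Sun.",
             "Tides arise chiefly from the Moon's gravitational pull.")]
print(f"preferred over the reference on {win_rate(examples):.0%} of queries")
</code></pre> <p class="is-size-7">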
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.13274">arXiv:2407.13274</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.13274">pdf</a>, <a href="https://arxiv.org/format/2407.13274">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Aligning Explanations for Recommendation with Rating and Feature via Maximizing Mutual Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhao%2C+Y">Yurou Zhao</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Yiding Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruidong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Jiang%2C+F">Fei Jiang</a>, <a href="/search/cs?searchtype=author&amp;query=Guan%2C+L">Lu Guan</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+W">Wei Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Ma%2C+W">Weizhi Ma</a>, <a href="/search/cs?searchtype=author&amp;query=Mao%2C+J">Jiaxin Mao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.13274v2-abstract-short" style="display: inline;"> Providing natural language-based explanations to justify recommendations helps to improve users&#39; satisfaction and gain users&#39; trust. However, as current explanation generation methods are commonly trained with an objective to mimic existing user reviews, the generated explanations are often not aligned with the predicted ratings or some important features of the recommended items, and thus, are su&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13274v2-abstract-full').style.display = 'inline'; document.getElementById('2407.13274v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.13274v2-abstract-full" style="display: none;"> Providing natural language-based explanations to justify recommendations helps to improve users&#39; satisfaction and gain users&#39; trust. However, as current explanation generation methods are commonly trained with an objective to mimic existing user reviews, the generated explanations are often not aligned with the predicted ratings or some important features of the recommended items, and thus, are suboptimal in helping users make informed decisions on the recommendation platform. To tackle this problem, we propose a flexible, model-agnostic framework named MMI (Maximizing Mutual Information) to enhance the alignment between the generated natural language explanations and the predicted rating/important item features. Specifically, we propose to use mutual information (MI) as a measure for the alignment and train a neural MI estimator. Then, we treat a well-trained explanation generation model as the backbone model and further fine-tune it through reinforcement learning with guidance from the MI estimator, which rewards a generated explanation that is more aligned with the predicted rating or a pre-defined feature of the recommended item.
Experiments on three datasets demonstrate that our MMI framework can boost different backbone models, enabling them to outperform existing baselines in terms of alignment with predicted ratings and item features. Additionally, user studies verify that MI-enhanced explanations indeed facilitate users&#39; decisions and are favorable compared with other baselines due to their better alignment properties. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.13274v2-abstract-full').style.display = 'none'; document.getElementById('2407.13274v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 August, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 18 July, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper has been accepted by CIKM 2024, and the code repository will be updated soon</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2407.00912">arXiv:2407.00912</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2407.00912">pdf</a>, <a href="https://arxiv.org/format/2407.00912">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1145/3637528.3671519">10.1145/3637528.3671519 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> Unified Dual-Intent Translation for Joint Modeling of Search and Recommendation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yuting Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+Y">Yiqing Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruidong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Sun%2C+Y">Ying Sun</a>, <a href="/search/cs?searchtype=author&amp;query=Zhu%2C+Y">Yongchun Zhu</a>, <a href="/search/cs?searchtype=author&amp;query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+W">Wei Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Zhuang%2C+F">Fuzhen Zhuang</a>, <a href="/search/cs?searchtype=author&amp;query=An%2C+Z">Zhulin An</a>, <a href="/search/cs?searchtype=author&amp;query=Xu%2C+Y">Yongjun Xu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2407.00912v1-abstract-short" style="display: inline;"> Recommendation systems, which assist users in discovering their preferred items among numerous options, have served billions of users across various online platforms.
Intuitively, users&#39; interactions with items are highly driven by their unchanging inherent intents (e.g., always preferring high-quality items) and changing demand intents (e.g., wanting a T-shirt in summer but a down jacket in winte&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00912v1-abstract-full').style.display = 'inline'; document.getElementById('2407.00912v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2407.00912v1-abstract-full" style="display: none;"> Recommendation systems, which assist users in discovering their preferred items among numerous options, have served billions of users across various online platforms. Intuitively, users&#39; interactions with items are highly driven by their unchanging inherent intents (e.g., always preferring high-quality items) and changing demand intents (e.g., wanting a T-shirt in summer but a down jacket in winter). However, both types of intents are implicitly expressed in the recommendation scenario, posing challenges in leveraging them for accurate intent-aware recommendations. Fortunately, in the search scenario, which is often found alongside recommendation on the same online platform, users express their demand intents explicitly through their query words. Intuitively, in both scenarios, a user shares the same inherent intent, and the interactions may be influenced by the same demand intent. It is therefore feasible to utilize the interaction data from both scenarios to reinforce the dual intents for joint intent-aware modeling. However, the joint modeling must deal with two problems: 1) accurately modeling users&#39; implicit demand intents in recommendation; 2) modeling the relation between the dual intents and the interactive items. To address these problems, we propose a novel model named Unified Dual-Intent Translation for joint modeling of Search and Recommendation (UDITSR). To accurately simulate users&#39; demand intents in recommendation, we utilize real queries from search data as supervision information to guide its generation. To explicitly model the relation among the triplet &lt;inherent intent, demand intent, interactive item&gt;, we propose a dual-intent translation propagation mechanism to learn the triplet in the same semantic space via embedding translations. Extensive experiments demonstrate that UDITSR outperforms SOTA baselines both in search and recommendation tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2407.00912v1-abstract-full').style.display = 'none'; document.getElementById('2407.00912v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> July 2024.
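</p> <p class="is-size-7 has-text-grey-dark">A toy rendering of the dual-intent translation idea in this abstract: an item is scored by how close the translated intent (inherent intent plus demand intent) lands to its embedding, TransE-style. UDITSR learns such embeddings and supervises the demand intent with real search queries; all names and values below are illustrative, not the authors' implementation.</p> <pre class="is-size-7"><code>
import numpy as np

def translation_score(inherent, demand, item):
    # TransE-style: higher score when (inherent + demand) lands near the item
    return -np.linalg.norm(inherent + demand - item)

rng = np.random.default_rng(0)
user_inherent = rng.normal(size=8)   # stable preference, e.g. "high quality"
user_demand = rng.normal(size=8)     # current demand, e.g. "down jacket"
items = rng.normal(size=(5, 8))      # candidate item embeddings

scores = [translation_score(user_inherent, user_demand, it) for it in items]
print("top-ranked item index:", int(np.argmax(scores)))
</code></pre> <p class="is-size-7">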
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2406.07579">arXiv:2406.07579</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2406.07579">pdf</a>, <a href="https://arxiv.org/format/2406.07579">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> GFPack++: Improving 2D Irregular Packing by Learning Gradient Field with Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Xue%2C+T">Tianyang Xue</a>, <a href="/search/cs?searchtype=author&amp;query=Lu%2C+L">Lin Lu</a>, <a href="/search/cs?searchtype=author&amp;query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&amp;query=Wu%2C+M">Mingdong Wu</a>, <a href="/search/cs?searchtype=author&amp;query=Dong%2C+H">Hao Dong</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Y">Yanbin Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Renmin Han</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+B">Baoquan Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2406.07579v1-abstract-short" style="display: inline;"> 2D irregular packing is a classic combinatorial optimization problem with various applications, such as material utilization and texture atlas generation. This NP-hard problem requires efficient algorithms to optimize space utilization. Conventional numerical methods suffer from slow convergence and high computational cost. Existing learning-based methods, such as the score-based diffusion model,&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07579v1-abstract-full').style.display = 'inline'; document.getElementById('2406.07579v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2406.07579v1-abstract-full" style="display: none;"> 2D irregular packing is a classic combinatorial optimization problem with various applications, such as material utilization and texture atlas generation. This NP-hard problem requires efficient algorithms to optimize space utilization. Conventional numerical methods suffer from slow convergence and high computational cost. Existing learning-based methods, such as the score-based diffusion model, also have limitations, such as no rotation support, frequent collisions, poor adaptability to arbitrary boundaries, and slow inference. The difficulty of learning from teacher packing lies in capturing the complex geometric relationships among packing examples, which include the spatial (position, orientation) relationships of objects, their geometric features, and container boundary conditions. Representing these relationships in latent space is challenging. We propose GFPack++, an attention-based gradient field learning approach that addresses this challenge.
It consists of two pivotal strategies: <em>attention-based geometry encoding</em> for effective feature encoding and <em>attention-based relation encoding</em> for learning complex relationships. We investigate the utilization distribution between the teacher and inference data and design a weighting function to prioritize tighter teacher data during training, enhancing learning effectiveness. Our diffusion model supports continuous rotation and outperforms existing methods on various datasets. We achieve higher space utilization than several widely used baselines, run an order of magnitude faster than the previous diffusion-based method, and show promising generalization for arbitrary boundaries. We plan to release our source code and datasets to support further research in this direction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2406.07579v1-abstract-full').style.display = 'none'; document.getElementById('2406.07579v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> June 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18284">arXiv:2405.18284</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.18284">pdf</a>, <a href="https://arxiv.org/format/2405.18284">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Adaptive debiased SGD in high-dimensional GLMs with streaming data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruijian Han</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+L">Lan Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+Y">Yuanhang Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Lin%2C+Y">Yuanyuan Lin</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+J">Jian Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.18284v2-abstract-short" style="display: inline;"> Online statistical inference facilitates real-time analysis of sequentially collected data, making it different from traditional methods that rely on static datasets. This paper introduces a novel approach to online inference in high-dimensional generalized linear models, where we update regression coefficient estimates and their standard errors upon each new data arrival. In contrast to existing&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18284v2-abstract-full').style.display = 'inline'; document.getElementById('2405.18284v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.18284v2-abstract-full" style="display: none;"> Online statistical inference facilitates real-time analysis of sequentially collected data, making it different from traditional methods that rely on static datasets.
This paper introduces a novel approach to online inference in high-dimensional generalized linear models, where we update regression coefficient estimates and their standard errors upon each new data arrival. In contrast to existing methods that either require full dataset access or large-dimensional summary statistics storage, our method operates in a single-pass mode, significantly reducing both time and space complexity. The core of our methodological innovation lies in an adaptive stochastic gradient descent algorithm tailored for dynamic objective functions, coupled with a novel online debiasing procedure. This allows us to maintain low-dimensional summary statistics while effectively controlling optimization errors introduced by the dynamically changing loss functions. We demonstrate that our method, termed the Approximated Debiased Lasso (ADL), not only mitigates the need for the bounded individual probability condition but also significantly improves numerical performance. Numerical experiments demonstrate that the proposed ADL method consistently exhibits robust performance across various covariance matrix structures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18284v2-abstract-full').style.display = 'none'; document.getElementById('2405.18284v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 June, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">37 pages, 4 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.18102">arXiv:2405.18102</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.18102">pdf</a>, <a href="https://arxiv.org/ps/2405.18102">ps</a>, <a href="https://arxiv.org/format/2405.18102">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Science and Game Theory">cs.GT</span> </div> </div> <p class="title is-5 mathjax"> Apportionment with Weighted Seats </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Chingoma%2C+J">Julian Chingoma</a>, <a href="/search/cs?searchtype=author&amp;query=Endriss%2C+U">Ulle Endriss</a>, <a href="/search/cs?searchtype=author&amp;query=de+Haan%2C+R">Ronald de Haan</a>, <a href="/search/cs?searchtype=author&amp;query=Haret%2C+A">Adrian Haret</a>, <a href="/search/cs?searchtype=author&amp;query=Maly%2C+J">Jan Maly</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.18102v1-abstract-short" style="display: inline;"> Apportionment is the task of assigning resources to entities with different entitlements in a fair manner, and specifically a manner that is as proportional as possible. The best-known application concerns the assignment of parliamentary seats to political parties based on their share in the popular vote. 
Here we enrich the standard model of apportionment by associating each seat with a weight tha&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18102v1-abstract-full').style.display = 'inline'; document.getElementById('2405.18102v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.18102v1-abstract-full" style="display: none;"> Apportionment is the task of assigning resources to entities with different entitlements in a fair manner, and specifically a manner that is as proportional as possible. The best-known application concerns the assignment of parliamentary seats to political parties based on their share in the popular vote. Here we enrich the standard model of apportionment by associating each seat with a weight that reflects the value of that seat, for example because seats come with different roles, such as chair or treasurer, that have different (objective) values. We define several apportionment methods and natural fairness requirements for this new setting, and study the extent to which our methods satisfy our requirements. Our findings show that full fairness is harder to achieve than in the standard apportionment setting. At the same time, for relaxations of those requirements we can achieve stronger results than in the more general model of weighted fair division, where the values of objects are subjective. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.18102v1-abstract-full').style.display = 'none'; document.getElementById('2405.18102v1-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 28 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. 
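</p> <p class="is-size-7 has-text-grey-dark">One natural greedy rule for this weighted setting, sketched for illustration only (the paper defines and analyzes its own methods): hand out seats from heaviest to lightest, each time to the party whose received weight falls furthest below its proportional entitlement.</p> <pre class="is-size-7"><code>
def greedy_weighted_apportionment(votes, seat_weights):
    # entitlement of a party = vote share times the total seat weight
    total_votes = sum(votes.values())
    total_weight = sum(seat_weights)
    received = {party: 0.0 for party in votes}
    assignment = []
    for w in sorted(seat_weights, reverse=True):
        def deficit(party):
            entitled = votes[party] / total_votes * total_weight
            return entitled - received[party]
        winner = max(votes, key=deficit)  # most under-served party wins the seat
        received[winner] += w
        assignment.append((w, winner))
    return assignment

# three parties, four seats whose roles carry different weights
print(greedy_weighted_apportionment({"A": 50, "B": 30, "C": 20}, [3, 2, 1, 1]))
</code></pre> <p class="is-size-7">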
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2405.02965">arXiv:2405.02965</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2405.02965">pdf</a>, <a href="https://arxiv.org/format/2405.02965">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Robust Collaborative Perception without External Localization and Clock Devices </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Lei%2C+Z">Zixing Lei</a>, <a href="/search/cs?searchtype=author&amp;query=Ni%2C+Z">Zhenyang Ni</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Ruize Han</a>, <a href="/search/cs?searchtype=author&amp;query=Tang%2C+S">Shuo Tang</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+D">Dingju Wang</a>, <a href="/search/cs?searchtype=author&amp;query=Feng%2C+C">Chen Feng</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+S">Siheng Chen</a>, <a href="/search/cs?searchtype=author&amp;query=Wang%2C+Y">Yanfeng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2405.02965v2-abstract-short" style="display: inline;"> A consistent spatial-temporal coordination across multiple agents is fundamental for collaborative perception, which seeks to improve perception abilities through information exchange among agents. To achieve this spatial-temporal alignment, traditional methods depend on external devices to provide localization and clock signals. However, hardware-generated signals could be vulnerable to noise and&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02965v2-abstract-full').style.display = 'inline'; document.getElementById('2405.02965v2-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2405.02965v2-abstract-full" style="display: none;"> A consistent spatial-temporal coordination across multiple agents is fundamental for collaborative perception, which seeks to improve perception abilities through information exchange among agents. To achieve this spatial-temporal alignment, traditional methods depend on external devices to provide localization and clock signals. However, hardware-generated signals could be vulnerable to noise and potentially malicious attacks, jeopardizing the precision of spatial-temporal alignment. Rather than relying on external hardware, this work proposes a novel approach: aligning by recognizing the inherent geometric patterns within the perceptual data of various agents. Following this spirit, we propose a robust collaborative perception system that operates independently of external localization and clock devices. The key module of our system, <em>FreeAlign</em>, constructs a salient object graph for each agent based on its detected boxes and uses a graph neural network to identify common subgraphs between agents, leading to accurate relative pose and time. We validate <em>FreeAlign</em> on both real-world and simulated datasets.
The results show that the <em>FreeAlign</em>-empowered robust collaborative perception system performs comparably to systems relying on precise localization and clock devices. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2405.02965v2-abstract-full').style.display = 'none'; document.getElementById('2405.02965v2-abstract-short').style.display = 'inline';">&#9651; Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 May, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> May 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">6 pages, accepted to ICRA 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2404.01693">arXiv:2404.01693</a> <span>&nbsp;[<a href="https://arxiv.org/pdf/2404.01693">pdf</a>, <a href="https://arxiv.org/format/2404.01693">other</a>]&nbsp;</span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> HeMeNet: Heterogeneous Multichannel Equivariant Network for Protein Multitask Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&amp;query=Han%2C+R">Rong Han</a>, <a href="/search/cs?searchtype=author&amp;query=Huang%2C+W">Wenbing Huang</a>, <a href="/search/cs?searchtype=author&amp;query=Luo%2C+L">Lingxiao Luo</a>, <a href="/search/cs?searchtype=author&amp;query=Han%2C+X">Xinyan Han</a>, <a href="/search/cs?searchtype=author&amp;query=Shen%2C+J">Jiaming Shen</a>, <a href="/search/cs?searchtype=author&amp;query=Zhang%2C+Z">Zhiqiang Zhang</a>, <a href="/search/cs?searchtype=author&amp;query=Zhou%2C+J">Jun Zhou</a>, <a href="/search/cs?searchtype=author&amp;query=Chen%2C+T">Ting Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2404.01693v1-abstract-short" style="display: inline;"> Understanding and leveraging the 3D structures of proteins is central to a variety of biological and drug discovery tasks. While deep learning has been applied successfully for structure-based protein function prediction tasks, current methods usually employ distinct training for each task. However, each of the tasks is of small size, and such a single-task strategy hinders the models&#39; performance&hellip; <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2404.01693v1-abstract-full').style.display = 'inline'; document.getElementById('2404.01693v1-abstract-short').style.display = 'none';">&#9661; More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2404.01693v1-abstract-full" style="display: none;"> Understanding and leveraging the 3D structures of proteins is central to a variety of biological and drug discovery tasks. While deep learning has been applied successfully for structure-based protein function prediction tasks, current methods usually employ distinct training for each task.
arXiv:2404.01693 (https://arxiv.org/abs/2404.01693) [cs.LG]
HeMeNet: Heterogeneous Multichannel Equivariant Network for Protein Multitask Learning
Authors: Rong Han, Wenbing Huang, Lingxiao Luo, Xinyan Han, Jiaming Shen, Zhiqiang Zhang, Jun Zhou, Ting Chen
Abstract: Understanding and leveraging the 3D structures of proteins is central to a variety of biological and drug discovery tasks. While deep learning has been applied successfully for structure-based protein function prediction tasks, current methods usually employ distinct training for each task. However, each of these tasks is small in size, and such a single-task strategy hinders the models' performance and generalization ability. As some labeled 3D protein datasets are biologically related, combining multi-source datasets for larger-scale multi-task learning is one way to overcome this problem. In this paper, we propose a neural network model to address multiple tasks jointly upon the input of 3D protein structures. In particular, we first construct a standard structure-based multi-task benchmark called Protein-MT, consisting of 6 biologically relevant tasks, including affinity prediction and property prediction, integrated from 4 public datasets. Then, we develop a novel graph neural network for multi-task learning, dubbed Heterogeneous Multichannel Equivariant Network (HeMeNet), which is E(3)-equivariant and able to capture heterogeneous relationships between different atoms. In addition, HeMeNet achieves task-specific learning via a task-aware readout mechanism. Extensive evaluations on our benchmark verify the effectiveness of multi-task learning, and our model generally surpasses state-of-the-art models.
Submitted 2 April, 2024; originally announced April 2024.

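The abstract names a task-aware readout without spelling it out; below is a minimal PyTorch sketch of one plausible form, where each task owns a learnable query that attends over node embeddings. The attention pooling, dimensions, and per-task linear heads are assumptions for illustration, not the paper's architecture.

    import torch
    import torch.nn as nn

    class TaskAwareReadout(nn.Module):
        # Each task owns a learnable query vector; attention over the node
        # embeddings gives every task its own pooled graph representation,
        # which then feeds a task-specific prediction head.
        def __init__(self, dim, num_tasks, attn_heads=4):
            super().__init__()
            self.queries = nn.Parameter(torch.randn(num_tasks, dim))
            self.attn = nn.MultiheadAttention(dim, attn_heads, batch_first=True)
            self.task_heads = nn.ModuleList(
                nn.Linear(dim, 1) for _ in range(num_tasks))

        def forward(self, node_feats):              # (batch, nodes, dim)
            b = node_feats.shape[0]
            q = self.queries.unsqueeze(0).expand(b, -1, -1)
            pooled, _ = self.attn(q, node_feats, node_feats)
            return torch.cat(
                [head(pooled[:, t]) for t, head in enumerate(self.task_heads)],
                dim=-1)                             # (batch, num_tasks)

    readout = TaskAwareReadout(dim=128, num_tasks=6)
    scores = readout(torch.randn(2, 50, 128))       # one score per task
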
arXiv:2403.16430 (https://arxiv.org/abs/2403.16430) [cs.RO] DOI: 10.1145/3636534.3649382
AeroBridge: Autonomous Drone Handoff System for Emergency Battery Service
Authors: Avishkar Seth, Alice James, Endrowednes Kuantama, Richard Han, Subhas Mukhopadhyay
Abstract: This paper proposes an Emergency Battery Service (EBS) for drones, in which an EBS drone flies to a drone in the field with a depleted battery and transfers a fresh battery to the exhausted drone. The authors present a unique battery transfer mechanism and drone localization that uses the Cross Marker Position (CMP) method. The main challenges include a stable and balanced transfer that precisely localizes the receiver drone. The proposed EBS drone mitigates the effects of downwash caused by the vertical proximity between the drones by implementing diagonal alignment with the receiver, reducing the distance between the two drones to 0.5 m. CFD analysis shows that diagonal rather than perpendicular alignment minimizes turbulence, and the authors verify the actual system with measurements of the change in output airflow and thrust. The CMP marker-based localization method enables a position lock for the EBS drone with up to 0.9 cm accuracy. The performance of the transfer mechanism is validated experimentally by a successful 5-second mid-air transfer, with the EBS drone within 0.5 m vertical distance of the receiver drone and 4 m/s turbulence not affecting the transfer process.
Submitted 25 March, 2024; originally announced March 2024.

arXiv:2403.13667 (https://arxiv.org/abs/2403.13667) [cs.CV, cs.MM]
DanceCamera3D: 3D Camera Movement Synthesis with Music and Dance
Authors: Zixuan Wang, Jia Jia, Shikun Sun, Haozhe Wu, Rong Han, Zhenyu Li, Di Tang, Jiaqing Zhou, Jiebo Luo
Abstract: Choreographers determine what the dances look like, while cameramen determine the final presentation of dances. Recently, various methods and datasets have showcased the feasibility of dance synthesis. However, camera movement synthesis with music and dance remains an unsolved challenging problem due to the scarcity of paired data. Thus, we present DCM, a new multi-modal 3D dataset, which for the first time combines camera movement with dance motion and music audio. This dataset encompasses 108 dance sequences (3.2 hours) of paired dance-camera-music data from the anime community, covering 4 music genres. With this dataset, we uncover that dance camera movement is multifaceted and human-centric, and possesses multiple influencing factors, making dance camera synthesis a more challenging task compared to camera or dance synthesis alone. To overcome these difficulties, we propose DanceCamera3D, a transformer-based diffusion model that incorporates a novel body attention loss and a condition separation strategy. For evaluation, we devise new metrics measuring camera movement quality, diversity, and dancer fidelity. Utilizing these metrics, we conduct extensive experiments on our DCM dataset, providing both quantitative and qualitative evidence showcasing the effectiveness of our DanceCamera3D model. Code and video demos are available at https://github.com/Carmenw1203/DanceCamera3D-Official.
Submitted 20 March, 2024; originally announced March 2024.
Comments: Accepted to CVPR 2024

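The "condition separation strategy" is not detailed in the abstract; the sketch below shows one plausible reading at sampling time: composable classifier-free guidance where the music and dance-pose conditions are dropped independently, so each gets its own guidance weight. The denoiser signature, weights, and tensor shapes are invented for illustration.

    import torch

    def separated_guidance(denoiser, x_t, t, music, pose,
                           w_music=2.0, w_pose=1.5):
        # Each condition is ablated on its own, so its pull on the predicted
        # camera-motion noise can be weighted independently.
        e_none = denoiser(x_t, t, None, None)
        e_music = denoiser(x_t, t, music, None)
        e_pose = denoiser(x_t, t, None, pose)
        return (e_none
                + w_music * (e_music - e_none)
                + w_pose * (e_pose - e_none))

    # Toy stand-in denoiser, just to show the call shape.
    toy = lambda x, t, m, p: torch.zeros_like(x)
    eps = separated_guidance(toy, torch.randn(1, 120, 9),
                             torch.tensor([50]), None, None)
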
arXiv:2403.06828 (https://arxiv.org/abs/2403.06828) [cs.RO, cs.AI]
NeuPAN: Direct Point Robot Navigation with End-to-End Model-based Learning
Authors: Ruihua Han, Shuai Wang, Shuaijun Wang, Zeqing Zhang, Jianjun Chen, Shijie Lin, Chengyang Li, Chengzhong Xu, Yonina C. Eldar, Qi Hao, Jia Pan
Abstract: Navigating a nonholonomic robot in a cluttered, unknown environment requires accurate perception and precise motion control for real-time collision avoidance. This paper presents NeuPAN: a real-time, highly accurate, map-free, easy-to-deploy, and environment-invariant robot motion planner. Leveraging a tightly coupled perception-to-control framework, NeuPAN has two key innovations compared to existing approaches: 1) it directly maps raw point cloud data to a latent distance feature space for collision-free motion generation, avoiding error propagation from the perception-to-control pipeline; 2) it is interpretable from an end-to-end model-based learning perspective. The crux of NeuPAN is solving an end-to-end mathematical model with numerous point-level constraints using a plug-and-play (PnP) proximal alternating-minimization network (PAN), incorporating neurons in the loop. This allows NeuPAN to generate real-time, physically interpretable motions. It seamlessly integrates data and knowledge engines, and its network parameters can be fine-tuned via backpropagation. We evaluate NeuPAN on a ground mobile robot, a wheel-legged robot, and an autonomous vehicle, in extensive simulated and real-world environments. Results demonstrate that NeuPAN outperforms existing baselines in terms of accuracy, efficiency, robustness, and generalization capabilities across various environments, including a cluttered sandbox, an office, a corridor, and a parking lot. We show that NeuPAN works well in unknown and unstructured environments with arbitrarily shaped objects, transforming impassable paths into passable ones.
Submitted 11 February, 2025; v1 submitted 11 March, 2024; originally announced March 2024.
Comments: Accepted by TRO 2025; project website: https://hanruihua.github.io/neupan_project/

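As a rough illustration of the alternating-minimization idea, the toy loop below refines a control sequence by gradient descent on a cost with point-level collision penalties. NeuPAN's learned latent distance features and real robot dynamics are replaced here by exact point distances and a trivial integrator, so everything in this sketch is a stand-in, not the authors' PAN solver.

    import torch

    def refine_controls(points, u_init, goal, steps=30, lr=0.05, safe=0.5):
        # Roll the controls out to positions, score each position's clearance
        # against the raw point cloud, and descend a cost that trades goal
        # progress against point-level collision penalties.
        u = u_init.clone().requires_grad_(True)
        opt = torch.optim.SGD([u], lr=lr)
        for _ in range(steps):
            x = torch.cumsum(u, dim=0)                       # toy dynamics
            dist = torch.cdist(x, points).min(dim=1).values  # per-step clearance
            cost = ((x[-1] - goal) ** 2).sum() + torch.relu(safe - dist).sum()
            opt.zero_grad()
            cost.backward()
            opt.step()
        return u.detach()

    points = torch.tensor([[1.0, 0.1], [2.0, -0.1]])   # obstacle point cloud
    u = refine_controls(points, torch.zeros(10, 2),
                        goal=torch.tensor([3.0, 0.0]))
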
arXiv:2403.03541 (https://arxiv.org/abs/2403.03541) [cs.RO]
Seamless Virtual Reality with Integrated Synchronizer and Synthesizer for Autonomous Driving
Authors: He Li, Ruihua Han, Zirui Zhao, Wei Xu, Qi Hao, Shuai Wang, Chengzhong Xu
Abstract: Virtual reality (VR) is a promising data engine for autonomous driving (AD). However, data fidelity in this paradigm is often degraded by VR inconsistency, for which the existing VR approaches become ineffective, as they ignore the inter-dependency between low-level VR synchronizer designs (i.e., data collector) and high-level VR synthesizer designs (i.e., data processor). This paper presents a seamless virtual reality (SVR) platform for AD, which mitigates such inconsistency, enabling VR agents to interact with each other in a shared symbiotic world. The crux of SVR is an integrated synchronizer and synthesizer (IS2) design, which consists of a drift-aware lidar-inertial synchronizer for VR colocation and a motion-aware deep visual synthesis network for augmented reality image generation. We implement SVR on car-like robots in two sandbox platforms, achieving cm-level VR colocalization accuracy and 3.2% VR image deviation, thereby avoiding missed collisions and model clipping. Experiments show that the proposed SVR reduces intervention times, missed turns, and failure rates compared to other benchmarks. The SVR-trained neural network can handle unseen situations in real-world environments by leveraging knowledge learnt from the VR space.
Submitted 6 March, 2024; originally announced March 2024.

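The drift-aware lidar-inertial synchronizer is not specified in the abstract. A common building block for this kind of synchronization, sketched below purely under that assumption, is estimating the inter-sensor clock offset by cross-correlating motion magnitudes from the two streams.

    import numpy as np

    def estimate_clock_offset(sig_a, sig_b, dt):
        # Cross-correlate normalized motion magnitudes (e.g., IMU angular
        # rate vs. scan-matched lidar rotation rate) resampled on a common
        # grid; the lag of the correlation peak estimates the clock offset.
        a = (sig_a - sig_a.mean()) / (sig_a.std() + 1e-9)
        b = (sig_b - sig_b.mean()) / (sig_b.std() + 1e-9)
        lag = int(np.correlate(a, b, mode="full").argmax()) - (len(b) - 1)
        return lag * dt

    t = np.arange(0.0, 10.0, 0.01)
    imu = np.sin(t)
    lidar = np.sin(t - 0.25)    # the lidar stream lags the IMU by 0.25 s
    print(estimate_clock_offset(imu, lidar, dt=0.01))
    # about -0.25 (negative: the second stream lags the first)
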
arXiv:2403.01513 (https://arxiv.org/abs/2403.01513) [eess.IV, cs.CV]
CDSE-UNet: Enhancing COVID-19 CT Image Segmentation with Canny Edge Detection and Dual-Path SENet Feature Fusion
Authors: Jiao Ding, Jie Chang, Renrui Han, Li Yang
Abstract: Accurate segmentation of COVID-19 CT images is crucial for reducing the severity and mortality rates associated with COVID-19 infections. In response to the blurred boundaries and high variability characteristic of lesion areas in COVID-19 CT images, we introduce CDSE-UNet: a novel UNet-based segmentation model that integrates Canny operator edge detection and a dual-path SENet feature fusion mechanism. This model enhances the standard UNet architecture by employing the Canny operator for edge detection in sample images, paralleling this with a similar network structure for semantic feature extraction. A key innovation is the Double SENet Feature Fusion Block, applied across corresponding network layers to effectively combine features from both image paths. Moreover, we have developed a Multiscale Convolution approach, replacing the standard Convolution in UNet, to adapt to the varied lesion sizes and shapes. This addition not only aids in accurately classifying lesion edge pixels but also significantly improves channel differentiation and expands the capacity of the model. Our evaluations on public datasets demonstrate CDSE-UNet's superior performance over other leading models, particularly in segmenting large and small lesion areas, accurately delineating lesion edges, and effectively suppressing noise.
Submitted 3 March, 2024; originally announced March 2024.

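The abstract names the Double SENet Feature Fusion Block without detail; below is a minimal PyTorch sketch of one plausible form: the semantic path and the Canny-edge path are each recalibrated by a squeeze-and-excitation gate, then merged by a 1x1 convolution. Channel sizes and the merge step are illustrative assumptions.

    import torch
    import torch.nn as nn

    class SEGate(nn.Module):
        # Standard squeeze-and-excitation channel reweighting.
        def __init__(self, channels, reduction=8):
            super().__init__()
            self.fc = nn.Sequential(
                nn.Linear(channels, channels // reduction), nn.ReLU(),
                nn.Linear(channels // reduction, channels), nn.Sigmoid())

        def forward(self, x):                       # (B, C, H, W)
            w = self.fc(x.mean(dim=(2, 3)))
            return x * w[:, :, None, None]

    class DoubleSEFusion(nn.Module):
        # One plausible reading of the Double SENet Feature Fusion Block:
        # recalibrate the semantic and Canny-edge paths separately, then
        # merge them with a 1x1 convolution.
        def __init__(self, channels):
            super().__init__()
            self.se_sem = SEGate(channels)
            self.se_edge = SEGate(channels)
            self.merge = nn.Conv2d(2 * channels, channels, kernel_size=1)

        def forward(self, f_sem, f_edge):
            fused = torch.cat([self.se_sem(f_sem), self.se_edge(f_edge)], dim=1)
            return self.merge(fused)

    fuse = DoubleSEFusion(64)
    out = fuse(torch.randn(1, 64, 32, 32), torch.randn(1, 64, 32, 32))
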
arXiv:2402.02108 (https://arxiv.org/abs/2402.02108) [cs.CV]
Synthetic-To-Real Video Person Re-ID
Authors: Xiangqun Zhang, Wei Feng, Ruize Han, Likai Wang, Linqi Song, Junhui Hou
Abstract: Person re-identification (Re-ID) is an important task with significant applications in public security and information forensics, and it has progressed rapidly with the development of deep learning. In this work, we investigate a novel and challenging setting of Re-ID, i.e., cross-domain video-based person Re-ID. Specifically, we utilize synthetic video datasets as the source domain for training and real-world videos for testing, notably reducing the reliance on expensive real data acquisition and annotation. To harness the potential of synthetic data, we first propose a self-supervised domain-invariant feature learning strategy for both static and dynamic (temporal) features. Additionally, to enhance person identification accuracy in the target domain, we propose a mean-teacher scheme incorporating a self-supervised ID consistency loss. Experimental results across five real datasets validate the rationale behind cross-synthetic-real domain adaptation and demonstrate the efficacy of our method. Notably, the discovery that synthetic data outperforms real data in the cross-domain scenario is a surprising outcome. The code and data are publicly available at https://github.com/XiangqunZhang/UDA_Video_ReID.
Submitted 4 February, 2025; v1 submitted 3 February, 2024; originally announced February 2024.

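For readers unfamiliar with mean-teacher schemes, the sketch below shows the two standard ingredients: an exponential-moving-average (EMA) weight update and a consistency loss. The cosine-agreement form of the ID consistency loss is an assumption, since the paper's exact loss is not given in the abstract.

    import torch
    import torch.nn.functional as F

    @torch.no_grad()
    def ema_update(teacher, student, momentum=0.999):
        # The teacher's weights track an exponential moving average of the
        # student's, yielding a more stable target model.
        for p_t, p_s in zip(teacher.parameters(), student.parameters()):
            p_t.mul_(momentum).add_(p_s, alpha=1.0 - momentum)

    def id_consistency_loss(student_emb, teacher_emb):
        # One simple consistency objective: the student's tracklet embedding
        # should agree (in cosine similarity) with the teacher's embedding
        # of the same target-domain tracklet.
        s = F.normalize(student_emb, dim=-1)
        t = F.normalize(teacher_emb.detach(), dim=-1)
        return (1.0 - (s * t).sum(dim=-1)).mean()

    student = torch.nn.Linear(8, 4)
    teacher = torch.nn.Linear(8, 4)
    teacher.load_state_dict(student.state_dict())  # start from same weights
    ema_update(teacher, student)
    x = torch.randn(2, 8)                          # stand-in tracklet features
    loss = id_consistency_loss(student(x), teacher(x))
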
arXiv:2402.01808 (https://arxiv.org/abs/2402.01808) [cs.SD, eess.AS]
KS-Net: Multi-band joint speech restoration and enhancement network for 2024 ICASSP SSI Challenge
Authors: Guochen Yu, Runqiang Han, Chenglin Xu, Haoran Zhao, Nan Li, Chen Zhang, Xiguang Zheng, Chao Zhou, Qi Huang, Bing Yu
Abstract: This paper presents the speech restoration and enhancement system created by the 1024K team for the ICASSP 2024 Speech Signal Improvement (SSI) Challenge. Our system consists of a complex-domain generative adversarial network (GAN) for speech restoration and a fine-grained multi-band fusion module for speech enhancement. On the blind test set of SSI, the proposed system achieves an overall mean opinion score (MOS) of 3.49 based on ITU-T P.804 and a word accuracy rate (WAcc) of 0.78 for the real-time track, as well as an overall P.804 MOS of 3.43 and a WAcc of 0.78 for the non-real-time track, ranking 1st in both tracks.
Submitted 2 February, 2024; originally announced February 2024.
Comments: Accepted to ICASSP 2024; ranked 1st in the ICASSP 2024 Speech Signal Improvement (SSI) Challenge

arXiv:2401.17617 (https://arxiv.org/abs/2401.17617) [cs.CV, cs.AI]
Unveiling the Power of Self-supervision for Multi-view Multi-human Association and Tracking
Authors: Wei Feng, Feifan Wang, Ruize Han, Zekun Qian, Song Wang
Abstract: Multi-view multi-human association and tracking (MvMHAT) is a new but important problem in multi-person video surveillance. It aims to track a group of people over time in each view and, at the same time, to identify the same person across different views, which differs from previous MOT and multi-camera MOT tasks that consider only over-time human tracking. This way, the videos for MvMHAT require more complex annotations while containing more information for self-learning. In this work, we tackle this problem with a self-supervised-learning-aware end-to-end network. Specifically, we propose to take advantage of the spatial-temporal self-consistency rationale by considering three properties: reflexivity, symmetry, and transitivity. Besides the reflexivity property, which naturally holds, we design self-supervised learning losses based on the properties of symmetry and transitivity, for both appearance feature learning and assignment matrix optimization, to associate multiple humans over time and across views. Furthermore, to promote research on MvMHAT, we build two new large-scale benchmarks for network training and testing of different algorithms. Extensive experiments on the proposed benchmarks verify the effectiveness of our method. We have released the benchmark and code to the public.
Submitted 31 January, 2024; originally announced January 2024.

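The symmetry and transitivity properties translate naturally into losses on soft assignment matrices; the sketch below shows one straightforward instantiation (the paper's exact loss forms are not given in the abstract).

    import torch

    def symmetry_loss(A_ab, A_ba):
        # Symmetry: associating people from view a to view b and back
        # should be the identity on view a.
        P = A_ab @ A_ba
        eye = torch.eye(P.shape[-1], device=P.device)
        return (P - eye).abs().mean()

    def transitivity_loss(A_ab, A_bc, A_ac):
        # Transitivity: the composed association a -> b -> c should agree
        # with the direct a -> c assignment.
        return (A_ab @ A_bc - A_ac).abs().mean()

    # Toy usage with row-stochastic soft assignment matrices.
    A_ab = torch.softmax(torch.randn(5, 5), dim=1)
    A_ba = torch.softmax(torch.randn(5, 5), dim=1)
    loss = symmetry_loss(A_ab, A_ba)
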
arXiv:2401.15839 (https://arxiv.org/abs/2401.15839) [cs.NI]
Swarm: Cost-Efficient Video Content Distribution with a Peer-to-Peer System
Authors: Dehui Wei, Jiao Zhang, Haozhe Li, Zhichen Xue, Yajie Peng, Xiaofei Pang, Rui Han, Yan Ma, Jialin Li
Abstract: As ByteDance's business expands, the substantial infrastructure expenses associated with centralized Content Delivery Network (CDN) networks have rendered content distribution costs prohibitively high. In response, we embarked on exploring a peer-to-peer (P2P) network as a promising solution to alleviate the escalating costs of content distribution. However, the decentralized nature of P2P often introduces performance challenges, given the diversity and dispersion of peer devices. This study introduces Swarm, ByteDance's innovative hybrid system for video streaming. Swarm seamlessly integrates the robustness of a conventional CDN with the cost-efficiency of a decentralized P2P network. Its primary aim is to provide users with reliable streaming quality while minimizing traffic expenses. To achieve this, Swarm employs a centralized control plane, comprised of a tracker cluster, overseeing a data plane with numerous edge residual resources. The tracker also takes on the responsibility of mapping clients to servers. Addressing the performance disparities among individual peer servers, Swarm utilizes our proprietary multipath parallel transmission method for communication between clients and peer servers. Operating stably for six years, Swarm now manages over a hundred thousand peer servers, serving nearly a hundred million users daily and saving the company hundreds of millions of RMB annually. Experimental results affirm that, while significantly cutting costs, Swarm performs on par with traditional CDNs.
Submitted 28 January, 2024; originally announced January 2024.

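The abstract says the tracker maps clients to peer servers but not how. Purely as an illustration of such a control-plane policy (not ByteDance's actual scheduler), the snippet below ranks same-region peers by expected throughput per unit cost and keeps the CDN as a guaranteed fallback.

    def assign_servers(client, peers, k=3, floor=0.2):
        # Illustrative tracker policy: restrict to same-region peers, rank
        # by expected throughput per unit cost, and fall back to the CDN
        # when no peer clears the quality floor.
        candidates = [p for p in peers if p["region"] == client["region"]]
        candidates.sort(key=lambda p: p["est_mbps"] / p["cost_per_gb"],
                        reverse=True)
        picked = [p for p in candidates[:k]
                  if p["est_mbps"] / p["cost_per_gb"] >= floor]
        return picked or [{"id": "cdn"}]

    peers = [
        {"id": "p1", "region": "east", "est_mbps": 40.0, "cost_per_gb": 0.1},
        {"id": "p2", "region": "east", "est_mbps": 5.0, "cost_per_gb": 0.4},
    ]
    print(assign_servers({"region": "east"}, peers, k=2))
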
arXiv:2401.03697 (https://arxiv.org/abs/2401.03697) [cs.SD, eess.AS]
An audio-quality-based multi-strategy approach for target speaker extraction in the MISP 2023 Challenge
Authors: Runduo Han, Xiaopeng Yan, Weiming Xu, Pengcheng Guo, Jiayao Sun, He Wang, Quan Lu, Ning Jiang, Lei Xie
Abstract: This paper describes our audio-quality-based multi-strategy approach for the audio-visual target speaker extraction (AVTSE) task in the Multi-modal Information based Speech Processing (MISP) 2023 Challenge. Specifically, our approach adopts different extraction strategies based on the audio quality, striking a balance between interference removal and speech preservation, which benefits the back-end automatic speech recognition (ASR) systems. Experiments show that our approach achieves a character error rate (CER) of 24.2% and 33.2% on the Dev and Eval sets, respectively, obtaining second place in the challenge.
Submitted 6 March, 2024; v1 submitted 8 January, 2024; originally announced January 2024.
Comments: Accepted by ICASSP 2024

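The core idea, choosing an extraction strategy per segment from estimated audio quality, can be summarized in a few lines. The router below is illustrative only; the quality estimator, extractors, and threshold are made-up stand-ins, not the team's actual pipeline.

    from types import SimpleNamespace

    def extract(segment, est_quality, aggressive_tse, light_enhance, thr=0.5):
        # Illustrative routing rule: heavily corrupted segments go through
        # the aggressive audio-visual extractor; cleaner segments get only
        # light enhancement so the ASR back end keeps undistorted speech.
        if est_quality(segment.audio) < thr:
            return aggressive_tse(segment.audio, segment.lip_video)
        return light_enhance(segment.audio)

    seg = SimpleNamespace(audio=[0.0] * 16000, lip_video=None)
    out = extract(seg,
                  est_quality=lambda a: 0.3,      # pretend low quality
                  aggressive_tse=lambda a, v: a,  # stand-in extractor
                  light_enhance=lambda a: a)      # stand-in enhancer
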