Search | arXiv e-print repository

<!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8"/> <meta name="viewport" content="width=device-width, initial-scale=1"/>  <link rel="apple-touch-icon" sizes="180x180" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon-16x16.png"> <link rel="manifest" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/site.webmanifest"> <link rel="mask-icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/safari-pinned-tab.svg" color="#b31b1b"> <link rel="shortcut icon" href="https://static.arxiv.org/static/base/1.0.0a5/images/icons/favicon.ico"> <meta name="msapplication-TileColor" content="#b31b1b"> <meta name="msapplication-config" content="images/icons/browserconfig.xml"> <meta name="theme-color" content="#b31b1b">  <title>Search | arXiv e-print repository</title> <script defer src="https://static.arxiv.org/static/base/1.0.0a5/fontawesome-free-5.11.2-web/js/all.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/base/1.0.0a5/css/arxivstyle.css" /> <script type="text/x-mathjax-config"> MathJax.Hub.Config({ messageStyle: "none", extensions: ["tex2jax.js"], jax: ["input/TeX", "output/HTML-CSS"], tex2jax: { inlineMath: [ ['$','$'], ["\$","\$"] ], displayMath: [ ['$$','$$'], ["\\[","\\]"] ], processEscapes: true, ignoreClass: '.*', processClass: 'mathjax.*' }, TeX: { extensions: ["AMSmath.js", "AMSsymbols.js", "noErrors.js"], noErrors: { inlineDelimiters: ["$","$"], multiLine: false, style: { "font-size": "normal", "border": "" } } }, "HTML-CSS": { availableFonts: ["TeX"] } }); </script> <script src='//static.arxiv.org/MathJax-2.7.3/MathJax.js'></script> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/notification.js"></script> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/bulma-tooltip.min.css" /> <link rel="stylesheet" href="https://static.arxiv.org/static/search/0.5.6/css/search.css" /> <script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g=" crossorigin="anonymous"></script> <script src="https://static.arxiv.org/static/search/0.5.6/js/fieldset.js"></script> <style> radio#cf-customfield_11400 { display: none; } </style> </head> <body> <header><a href="#main-container" class="is-sr-only">Skip to main content</a>  <div class="attribution level is-marginless" role="banner"> <div class="level-left"> <a class="level-item" href="https://cornell.edu/"><img src="https://static.arxiv.org/static/base/1.0.0a5/images/cornell-reduced-white-SMALL.svg" alt="Cornell University" width="200" aria-label="logo" /></a> </div> <div class="level-right is-marginless"><p class="sponsors level-item is-marginless"><span id="support-ack-url">We gratefully acknowledge support from<br /> the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors. <a href="https://info.arxiv.org/about/donate.html">Donate</a></span></p></div> </div>  <div class="identity level is-marginless"> <div class="level-left"> <div class="level-item"> <a class="arxiv" href="https://arxiv.org/" aria-label="arxiv-logo"> <img src="https://static.arxiv.org/static/base/1.0.0a5/images/arxiv-logo-one-color-white.svg" aria-label="logo" alt="arxiv logo" width="85" style="width:85px;"/> </a> </div> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> </form> </div> </div>  <div class="container"> <div class="user-tools is-size-7 has-text-right has-text-weight-bold" role="navigation" aria-label="User menu"> <a href="https://arxiv.org/login">Login</a> </div> </div> </header> <main class="container" id="main-container"> <div class="level is-marginless"> <div class="level-left"> <h1 class="title is-clearfix"> Showing 1–50 of 1,452 results for author: <span class="mathjax">Yang, C</span> </h1> </div> <div class="level-right is-hidden-mobile">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> <div class="content"> <form method="GET" action="/search/cs" aria-role="search"> Searching in archive <strong>cs</strong>. <a href="/search/?searchtype=author&query=Yang%2C+C">Search in all archives.</a> <div class="field has-addons-tablet"> <div class="control is-expanded"> <label for="query" class="hidden-label">Search term or terms</label> <input class="input is-medium" id="query" name="query" placeholder="Search term..." type="text" value="Yang, C"> </div> <div class="select control is-medium"> <label class="is-hidden" for="searchtype">Field</label> <select class="is-medium" id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> </div> <div class="control"> <button class="button is-link is-medium">Search</button> </div> </div> <div class="field"> <div class="control is-size-7"> <label class="radio"> <input checked id="abstracts-0" name="abstracts" type="radio" value="show"> Show abstracts </label> <label class="radio"> <input id="abstracts-1" name="abstracts" type="radio" value="hide"> Hide abstracts </label> </div> </div> <div class="is-clearfix" style="height: 2.5em"> <div class="is-pulled-right"> <a href="/search/advanced?terms-0-term=Yang%2C+C&terms-0-field=author&size=50&order=-announced_date_first">Advanced Search</a> </div> </div> <input type="hidden" name="order" value="-announced_date_first"> <input type="hidden" name="size" value="50"> </form> <div class="level breathe-horizontal"> <div class="level-left"> <form method="GET" action="/search/"> <div style="display: none;"> <select id="searchtype" name="searchtype"><option value="all">All fields</option><option value="title">Title</option><option selected value="author">Author(s)</option><option value="abstract">Abstract</option><option value="comments">Comments</option><option value="journal_ref">Journal reference</option><option value="acm_class">ACM classification</option><option value="msc_class">MSC classification</option><option value="report_num">Report number</option><option value="paper_id">arXiv identifier</option><option value="doi">DOI</option><option value="orcid">ORCID</option><option value="license">License (URI)</option><option value="author_id">arXiv author ID</option><option value="help">Help pages</option><option value="full_text">Full text</option></select> <input id="query" name="query" type="text" value="Yang, C"> <ul id="abstracts"><li><input checked id="abstracts-0" name="abstracts" type="radio" value="show"> <label for="abstracts-0">Show abstracts</label></li><li><input id="abstracts-1" name="abstracts" type="radio" value="hide"> <label for="abstracts-1">Hide abstracts</label></li></ul> </div> <div class="box field is-grouped is-grouped-multiline level-item"> <div class="control"> <span class="select is-small"> <select id="size" name="size"><option value="25">25</option><option selected value="50">50</option><option value="100">100</option><option value="200">200</option></select> </span> <label for="size">results per page</label>. </div> <div class="control"> <label for="order">Sort results by</label> <span class="select is-small"> <select id="order" name="order"><option selected value="-announced_date_first">Announcement date (newest first)</option><option value="announced_date_first">Announcement date (oldest first)</option><option value="-submitted_date">Submission date (newest first)</option><option value="submitted_date">Submission date (oldest first)</option><option value="">Relevance</option></select> </span> </div> <div class="control"> <button class="button is-small is-link">Go</button> </div> </div> </form> </div> </div> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yang%2C+C&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yang%2C+C&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+C&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+C&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+C&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+C&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <ol class="breathe-horizontal" start="1"> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13770">arXiv:2411.13770</a> <span> [<a href="https://arxiv.org/pdf/2411.13770">pdf</a>, <a href="https://arxiv.org/format/2411.13770">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> <div class="is-inline-block" style="margin-left: 0.5rem"> <div class="tags has-addons"> <span class="tag is-dark is-size-7">doi</span> <span class="tag is-light is-size-7"><a class="" href="https://doi.org/10.1109/TBME.2024.3469242">10.1109/TBME.2024.3469242 <i class="fa fa-external-link" aria-hidden="true"></i></a></span> </div> </div> </div> <p class="title is-5 mathjax"> A Novel Passive Occupational Shoulder Exoskeleton With Adjustable Peak Assistive Torque Angle For Overhead Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tian%2C+J">Jin Tian</a>, <a href="/search/cs?searchtype=author&query=Zhu%2C+H">Haiqi Zhu</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+C">Changjia Lu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chifu Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yingjie Liu</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+B">Baichun Wei</a>, <a href="/search/cs?searchtype=author&query=Yi%2C+C">Chunzhi Yi</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13770v1-abstract-short" style="display: inline;"> Objective: Overhead tasks are a primary inducement to work-related musculoskeletal disorders. Aiming to reduce shoulder physical loads, passive shoulder exoskeletons are increasingly prevalent in the industry due to their lightweight, affordability, and effectiveness. However, they can only handle specific tasks and struggle to balance compactness with a sufficient range of motion effectively. Met… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13770v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13770v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13770v1-abstract-full" style="display: none;"> Objective: Overhead tasks are a primary inducement to work-related musculoskeletal disorders. Aiming to reduce shoulder physical loads, passive shoulder exoskeletons are increasingly prevalent in the industry due to their lightweight, affordability, and effectiveness. However, they can only handle specific tasks and struggle to balance compactness with a sufficient range of motion effectively. Method: We proposed a novel passive occupational shoulder exoskeleton designed to handle various overhead tasks at different arm elevation angles, ensuring sufficient ROM while maintaining compactness. By formulating kinematic models and simulations, an ergonomic shoulder structure was developed. Then, we presented a torque generator equipped with an adjustable peak assistive torque angle to switch between low and high assistance phases through a passive clutch mechanism. Ten healthy participants were recruited to validate its functionality by performing the screwing task. Results: Measured range of motion results demonstrated that the exoskeleton can ensure a sufficient ROM in both sagittal (164$^\circ$) and horizontal (158$^\circ$) flexion/extension movements. The experimental results of the screwing task showed that the exoskeleton could reduce muscle activation (up to 49.6%), perceived effort and frustration, and provide an improved user experience (scored 79.7 out of 100). Conclusion: These results indicate that the proposed exoskeleton can guarantee natural movements and provide efficient assistance during overhead work, and thus have the potential to reduce the risk of musculoskeletal disorders. Significance: The proposed exoskeleton provides insights into multi-task adaptability and efficient assistance, highlighting the potential for expanding the application of exoskeletons. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13770v1-abstract-full').style.display = 'none'; document.getElementById('2411.13770v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Journal ref:</span> IEEE Transactions on Biomedical Engineering,2024 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13700">arXiv:2411.13700</a> <span> [<a href="https://arxiv.org/pdf/2411.13700">pdf</a>, <a href="https://arxiv.org/format/2411.13700">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> A Collaborative Ensemble Framework for CTR Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaolong Liu</a>, <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhichen Zeng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyi Liu</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+S">Siyang Yuan</a>, <a href="/search/cs?searchtype=author&query=Song%2C+W">Weinan Song</a>, <a href="/search/cs?searchtype=author&query=Hang%2C+M">Mengyue Hang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yiqun Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chaofei Yang</a>, <a href="/search/cs?searchtype=author&query=Kim%2C+D">Donghyun Kim</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wen-Yen Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+J">Jiyan Yang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Y">Yiping Han</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+R">Rong Jin</a>, <a href="/search/cs?searchtype=author&query=Long%2C+B">Bo Long</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+H">Hanghang Tong</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P+S">Philip S. Yu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13700v1-abstract-short" style="display: inline;"> Recent advances in foundation models have established scaling laws that enable the development of larger models to achieve enhanced performance, motivating extensive research into large-scale recommendation models. However, simply increasing the model size in recommendation systems, even with large amounts of data, does not always result in the expected performance improvements. In this paper, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13700v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13700v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13700v1-abstract-full" style="display: none;"> Recent advances in foundation models have established scaling laws that enable the development of larger models to achieve enhanced performance, motivating extensive research into large-scale recommendation models. However, simply increasing the model size in recommendation systems, even with large amounts of data, does not always result in the expected performance improvements. In this paper, we propose a novel framework, Collaborative Ensemble Training Network (CETNet), to leverage multiple distinct models, each with its own embedding table, to capture unique feature interaction patterns. Unlike naive model scaling, our approach emphasizes diversity and collaboration through collaborative learning, where models iteratively refine their predictions. To dynamically balance contributions from each model, we introduce a confidence-based fusion mechanism using general softmax, where model confidence is computed via negation entropy. This design ensures that more confident models have a greater influence on the final prediction while benefiting from the complementary strengths of other models. We validate our framework on three public datasets (AmazonElectronics, TaobaoAds, and KuaiVideo) as well as a large-scale industrial dataset from Meta, demonstrating its superior performance over individual models and state-of-the-art baselines. Additionally, we conduct further experiments on the Criteo and Avazu datasets to compare our method with the multi-embedding paradigm. Our results show that our framework achieves comparable or better performance with smaller embedding sizes, offering a scalable and efficient solution for CTR prediction tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13700v1-abstract-full').style.display = 'none'; document.getElementById('2411.13700v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13083">arXiv:2411.13083</a> <span> [<a href="https://arxiv.org/pdf/2411.13083">pdf</a>, <a href="https://arxiv.org/format/2411.13083">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">stat.ML</span> </div> </div> <p class="title is-5 mathjax"> Omnipredicting Single-Index Models with Multi-Index Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Hu%2C+L">Lunjia Hu</a>, <a href="/search/cs?searchtype=author&query=Tian%2C+K">Kevin Tian</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chutong Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13083v1-abstract-short" style="display: inline;"> Recent work on supervised learning [GKR+22] defined the notion of omnipredictors, i.e., predictor functions $p$ over features that are simultaneously competitive for minimizing a family of loss functions $\mathcal{L}$ against a comparator class $\mathcal{C}$. Omniprediction requires approximating the Bayes-optimal predictor beyond the loss minimization paradigm, and has generated significant inter… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13083v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13083v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13083v1-abstract-full" style="display: none;"> Recent work on supervised learning [GKR+22] defined the notion of omnipredictors, i.e., predictor functions $p$ over features that are simultaneously competitive for minimizing a family of loss functions $\mathcal{L}$ against a comparator class $\mathcal{C}$. Omniprediction requires approximating the Bayes-optimal predictor beyond the loss minimization paradigm, and has generated significant interest in the learning theory community. However, even for basic settings such as agnostically learning single-index models (SIMs), existing omnipredictor constructions require impractically-large sample complexities and runtimes, and output complex, highly-improper hypotheses. Our main contribution is a new, simple construction of omnipredictors for SIMs. We give a learner outputting an omnipredictor that is $\varepsilon$-competitive on any matching loss induced by a monotone, Lipschitz link function, when the comparator class is bounded linear predictors. Our algorithm requires $\approx \varepsilon^{-4}$ samples and runs in nearly-linear time, and its sample complexity improves to $\approx \varepsilon^{-2}$ if link functions are bi-Lipschitz. This significantly improves upon the only prior known construction, due to [HJKRR18, GHK+23], which used $\gtrsim \varepsilon^{-10}$ samples. We achieve our construction via a new, sharp analysis of the classical Isotron algorithm [KS09, KKKS11] in the challenging agnostic learning setting, of potential independent interest. Previously, Isotron was known to properly learn SIMs in the realizable setting, as well as constant-factor competitive hypotheses under the squared loss [ZWDD24]. As they are based on Isotron, our omnipredictors are multi-index models with $\approx \varepsilon^{-2}$ prediction heads, bringing us closer to the tantalizing goal of proper omniprediction for general loss families and comparators. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13083v1-abstract-full').style.display = 'none'; document.getElementById('2411.13083v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11922">arXiv:2411.11922</a> <span> [<a href="https://arxiv.org/pdf/2411.11922">pdf</a>, <a href="https://arxiv.org/format/2411.11922">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SAMURAI: Adapting Segment Anything Model for Zero-Shot Visual Tracking with Motion-Aware Memory </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cheng-Yen Yang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hsiang-Wei Huang</a>, <a href="/search/cs?searchtype=author&query=Chai%2C+W">Wenhao Chai</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhongyu Jiang</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+J">Jenq-Neng Hwang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11922v1-abstract-short" style="display: inline;"> The Segment Anything Model 2 (SAM 2) has demonstrated strong performance in object segmentation tasks but faces challenges in visual object tracking, particularly when managing crowded scenes with fast-moving or self-occluding objects. Furthermore, the fixed-window memory approach in the original model does not consider the quality of memories selected to condition the image features for the next… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11922v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11922v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11922v1-abstract-full" style="display: none;"> The Segment Anything Model 2 (SAM 2) has demonstrated strong performance in object segmentation tasks but faces challenges in visual object tracking, particularly when managing crowded scenes with fast-moving or self-occluding objects. Furthermore, the fixed-window memory approach in the original model does not consider the quality of memories selected to condition the image features for the next frame, leading to error propagation in videos. This paper introduces SAMURAI, an enhanced adaptation of SAM 2 specifically designed for visual object tracking. By incorporating temporal motion cues with the proposed motion-aware memory selection mechanism, SAMURAI effectively predicts object motion and refines mask selection, achieving robust, accurate tracking without the need for retraining or fine-tuning. SAMURAI operates in real-time and demonstrates strong zero-shot performance across diverse benchmark datasets, showcasing its ability to generalize without fine-tuning. In evaluations, SAMURAI achieves significant improvements in success rate and precision over existing trackers, with a 7.1% AUC gain on LaSOT$_{\text{ext}}$ and a 3.5% AO gain on GOT-10k. Moreover, it achieves competitive results compared to fully supervised methods on LaSOT, underscoring its robustness in complex tracking scenarios and its potential for real-world applications in dynamic environments. Code and results are available at https://github.com/yangchris11/samurai. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11922v1-abstract-full').style.display = 'none'; document.getElementById('2411.11922v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11507">arXiv:2411.11507</a> <span> [<a href="https://arxiv.org/pdf/2411.11507">pdf</a>, <a href="https://arxiv.org/format/2411.11507">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SignEye: Traffic Sign Interpretation from Vehicle First-Person View </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chuang Yang</a>, <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Han%2C+T">Tao Han</a>, <a href="/search/cs?searchtype=author&query=SU%2C+Y">Yuejiao SU</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Junyu Gao</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Hongyuan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yi Wang</a>, <a href="/search/cs?searchtype=author&query=Chau%2C+L">Lap-Pui Chau</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11507v1-abstract-short" style="display: inline;"> Traffic signs play a key role in assisting autonomous driving systems (ADS) by enabling the assessment of vehicle behavior in compliance with traffic regulations and providing navigation instructions. However, current works are limited to basic sign understanding without considering the egocentric vehicle's spatial position, which fails to support further regulation assessment and direction naviga… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11507v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11507v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11507v1-abstract-full" style="display: none;"> Traffic signs play a key role in assisting autonomous driving systems (ADS) by enabling the assessment of vehicle behavior in compliance with traffic regulations and providing navigation instructions. However, current works are limited to basic sign understanding without considering the egocentric vehicle's spatial position, which fails to support further regulation assessment and direction navigation. Following the above issues, we introduce a new task: traffic sign interpretation from the vehicle's first-person view, referred to as TSI-FPV. Meanwhile, we develop a traffic guidance assistant (TGA) scenario application to re-explore the role of traffic signs in ADS as a complement to popular autonomous technologies (such as obstacle perception). Notably, TGA is not a replacement for electronic map navigation; rather, TGA can be an automatic tool for updating it and complementing it in situations such as offline conditions or temporary sign adjustments. Lastly, a spatial and semantic logic-aware stepwise reasoning pipeline (SignEye) is constructed to achieve the TSI-FPV and TGA, and an application-specific dataset (Traffic-CN) is built. Experiments show that TSI-FPV and TGA are achievable via our SignEye trained on Traffic-CN. The results also demonstrate that the TGA can provide complementary information to ADS beyond existing popular autonomous technologies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11507v1-abstract-full').style.display = 'none'; document.getElementById('2411.11507v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11390">arXiv:2411.11390</a> <span> [<a href="https://arxiv.org/pdf/2411.11390">pdf</a>, <a href="https://arxiv.org/format/2411.11390">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Applications">stat.AP</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Association between built environment characteristics and school run traffic congestion in Beijing, China </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kang%2C+C">Chaogui Kang</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+X">Xiaxin Wu</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jialei Shi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chao Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11390v1-abstract-short" style="display: inline;"> School-escorted trips are a significant contributor to traffic congestion. Existing studies mainly compare road traffic during student pick-up/drop-off hours with off-peak times, often overlooking the fact that school-run traffic congestion is unevenly distributed across areas with different built environment characteristics. We examine the relationship between the built environment and school-run… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11390v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11390v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11390v1-abstract-full" style="display: none;"> School-escorted trips are a significant contributor to traffic congestion. Existing studies mainly compare road traffic during student pick-up/drop-off hours with off-peak times, often overlooking the fact that school-run traffic congestion is unevenly distributed across areas with different built environment characteristics. We examine the relationship between the built environment and school-run traffic congestion, using Beijing, China, as a case study. First, we use multi-source geospatial data to assess the built environment characteristics around schools across five dimensions: spatial concentration, transportation infrastructure, street topology, spatial richness, and scenescapes. Second, employing a generalized ordered logit model, we analyze how traffic congestion around schools varies during peak hours on school days, regular non-school days, and national college entrance exam days. Lastly, we identify the built environment factors contributing to school-run traffic congestion through multivariable linear regression and Shapley value explanations. Our findings reveal that: (1) School runs significantly exacerbate traffic congestion around schools, reducing the likelihood of free-flow by 8.34\% during school run times; (2) School-run traffic congestion is more severe in areas with multiple schools, bus stops, and scenescapes related to business and financial functions. These insights can inform the planning of new schools and urban upgrade strategies aimed at reducing traffic congestion. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11390v1-abstract-full').style.display = 'none'; document.getElementById('2411.11390v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">30 pages, 9 figures, 4 tables</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10919">arXiv:2411.10919</a> <span> [<a href="https://arxiv.org/pdf/2411.10919">pdf</a>, <a href="https://arxiv.org/format/2411.10919">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-Modal Self-Supervised Learning for Surgical Feedback Effectiveness Assessment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gupta%2C+A">Arushi Gupta</a>, <a href="/search/cs?searchtype=author&query=Kocielnik%2C+R">Rafal Kocielnik</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+J">Jiayun Wang</a>, <a href="/search/cs?searchtype=author&query=Nasriddinov%2C+F">Firdavs Nasriddinov</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cherine Yang</a>, <a href="/search/cs?searchtype=author&query=Wong%2C+E">Elyssa Wong</a>, <a href="/search/cs?searchtype=author&query=Anandkumar%2C+A">Anima Anandkumar</a>, <a href="/search/cs?searchtype=author&query=Hung%2C+A">Andrew Hung</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10919v1-abstract-short" style="display: inline;"> During surgical training, real-time feedback from trainers to trainees is important for preventing errors and enhancing long-term skill acquisition. Accurately predicting the effectiveness of this feedback, specifically whether it leads to a change in trainee behavior, is crucial for developing methods for improving surgical training and education. However, relying on human annotations to assess f… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10919v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10919v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10919v1-abstract-full" style="display: none;"> During surgical training, real-time feedback from trainers to trainees is important for preventing errors and enhancing long-term skill acquisition. Accurately predicting the effectiveness of this feedback, specifically whether it leads to a change in trainee behavior, is crucial for developing methods for improving surgical training and education. However, relying on human annotations to assess feedback effectiveness is laborious and prone to biases, underscoring the need for an automated, scalable, and objective method. Creating such an automated system poses challenges, as it requires an understanding of both the verbal feedback delivered by the trainer and the visual context of the real-time surgical scene. To address this, we propose a method that integrates information from transcribed verbal feedback and corresponding surgical video to predict feedback effectiveness. Our findings show that both transcribed feedback and surgical video are individually predictive of trainee behavior changes, and their combination achieves an AUROC of 0.70+/-0.02, improving prediction accuracy by up to 6.6%. Additionally, we introduce self-supervised fine-tuning as a strategy for enhancing surgical video representation learning, which is scalable and further enhances prediction performance. Our results demonstrate the potential of multi-modal learning to advance the automated assessment of surgical feedback. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10919v1-abstract-full').style.display = 'none'; document.getElementById('2411.10919v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted as a spotlight proceedings paper at Machine Learning for Health 2024</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07; 68T45; 68U10; 92C50 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2; I.2.10; I.5.4; I.4.7; J.3; K.3.1 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10773">arXiv:2411.10773</a> <span> [<a href="https://arxiv.org/pdf/2411.10773">pdf</a>, <a href="https://arxiv.org/format/2411.10773">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> An End-to-End Real-World Camera Imaging Pipeline </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+K">Kepeng Xu</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+Z">Zijia Ma</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+L">Li Xu</a>, <a href="/search/cs?searchtype=author&query=He%2C+G">Gang He</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunsong Li</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+W">Wenxin Yu</a>, <a href="/search/cs?searchtype=author&query=Han%2C+T">Taichu Han</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cheng Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10773v1-abstract-short" style="display: inline;"> Recent advances in neural camera imaging pipelines have demonstrated notable progress. Nevertheless, the real-world imaging pipeline still faces challenges including the lack of joint optimization in system components, computational redundancies, and optical distortions such as lens shading.In light of this, we propose an end-to-end camera imaging pipeline (RealCamNet) to enhance real-world camera… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10773v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10773v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10773v1-abstract-full" style="display: none;"> Recent advances in neural camera imaging pipelines have demonstrated notable progress. Nevertheless, the real-world imaging pipeline still faces challenges including the lack of joint optimization in system components, computational redundancies, and optical distortions such as lens shading.In light of this, we propose an end-to-end camera imaging pipeline (RealCamNet) to enhance real-world camera imaging performance. Our methodology diverges from conventional, fragmented multi-stage image signal processing towards end-to-end architecture. This architecture facilitates joint optimization across the full pipeline and the restoration of coordinate-biased distortions. RealCamNet is designed for high-quality conversion from RAW to RGB and compact image compression. Specifically, we deeply analyze coordinate-dependent optical distortions, e.g., vignetting and dark shading, and design a novel Coordinate-Aware Distortion Restoration (CADR) module to restore coordinate-biased distortions. Furthermore, we propose a Coordinate-Independent Mapping Compression (CIMC) module to implement tone mapping and redundant information compression. Existing datasets suffer from misalignment and overly idealized conditions, making them inadequate for training real-world imaging pipelines. Therefore, we collected a real-world imaging dataset. Experiment results show that RealCamNet achieves the best rate-distortion performance with lower inference latency. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10773v1-abstract-full').style.display = 'none'; document.getElementById('2411.10773v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accept by ACMMM 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.10130">arXiv:2411.10130</a> <span> [<a href="https://arxiv.org/pdf/2411.10130">pdf</a>, <a href="https://arxiv.org/format/2411.10130">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Towards Multi-View Consistent Style Transfer with One-Step Diffusion via Vision Conditioning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zuo%2C+Y">Yushen Zuo</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jun Xiao</a>, <a href="/search/cs?searchtype=author&query=Chan%2C+K">Kin-Chung Chan</a>, <a href="/search/cs?searchtype=author&query=Dong%2C+R">Rongkang Dong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cuixin Yang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zongqi He</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+H">Hao Xie</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+K">Kin-Man Lam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.10130v1-abstract-short" style="display: inline;"> The stylization of 3D scenes is an increasingly attractive topic in 3D vision. Although image style transfer has been extensively researched with promising results, directly applying 2D style transfer methods to 3D scenes often fails to preserve the structural and multi-view properties of 3D environments, resulting in unpleasant distortions in images from different viewpoints. To address these iss… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10130v1-abstract-full').style.display = 'inline'; document.getElementById('2411.10130v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.10130v1-abstract-full" style="display: none;"> The stylization of 3D scenes is an increasingly attractive topic in 3D vision. Although image style transfer has been extensively researched with promising results, directly applying 2D style transfer methods to 3D scenes often fails to preserve the structural and multi-view properties of 3D environments, resulting in unpleasant distortions in images from different viewpoints. To address these issues, we leverage the remarkable generative prior of diffusion-based models and propose a novel style transfer method, OSDiffST, based on a pre-trained one-step diffusion model (i.e., SD-Turbo) for rendering diverse styles in multi-view images of 3D scenes. To efficiently adapt the pre-trained model for multi-view style transfer on small datasets, we introduce a vision condition module to extract style information from the reference style image to serve as conditional input for the diffusion model and employ LoRA in diffusion model for adaptation. Additionally, we consider color distribution alignment and structural similarity between the stylized and content images using two specific loss functions. As a result, our method effectively preserves the structural information and multi-view consistency in stylized images without any 3D information. Experiments show that our method surpasses other promising style transfer methods in synthesizing various styles for multi-view images of 3D scenes. Stylized images from different viewpoints generated by our method achieve superior visual quality, with better structural integrity and less distortion. The source code is available at https://github.com/YushenZuo/OSDiffST. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.10130v1-abstract-full').style.display = 'none'; document.getElementById('2411.10130v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ECCV 2024 AI for Visual Arts Workshop and Challenges, 18 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09852">arXiv:2411.09852</a> <span> [<a href="https://arxiv.org/pdf/2411.09852">pdf</a>, <a href="https://arxiv.org/format/2411.09852">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> InterFormer: Towards Effective Heterogeneous Interaction Learning for Click-Through Rate Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhichen Zeng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaolong Liu</a>, <a href="/search/cs?searchtype=author&query=Hang%2C+M">Mengyue Hang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyi Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Q">Qinghai Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chaofei Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yiqun Liu</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+Y">Yichen Ruan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Laming Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxin Chen</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yujia Hao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiaqi Xu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jade Nie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xi Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Buyun Zhang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+W">Wei Wen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+S">Siyang Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kai Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wen-Yen Chen</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Y">Yiping Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huayu Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chunzhi Yang</a>, <a href="/search/cs?searchtype=author&query=Long%2C+B">Bo Long</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P+S">Philip S. Yu</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+H">Hanghang Tong</a> , et al. (1 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09852v1-abstract-short" style="display: inline;"> Click-through rate (CTR) prediction, which predicts the probability of a user clicking an ad, is a fundamental task in recommender systems. The emergence of heterogeneous information, such as user profile and behavior sequences, depicts user interests from different aspects. A mutually beneficial integration of heterogeneous information is the cornerstone towards the success of CTR prediction. How… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09852v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09852v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09852v1-abstract-full" style="display: none;"> Click-through rate (CTR) prediction, which predicts the probability of a user clicking an ad, is a fundamental task in recommender systems. The emergence of heterogeneous information, such as user profile and behavior sequences, depicts user interests from different aspects. A mutually beneficial integration of heterogeneous information is the cornerstone towards the success of CTR prediction. However, most of the existing methods suffer from two fundamental limitations, including (1) insufficient inter-mode interaction due to the unidirectional information flow between modes, and (2) aggressive information aggregation caused by early summarization, resulting in excessive information loss. To address the above limitations, we propose a novel module named InterFormer to learn heterogeneous information interaction in an interleaving style. To achieve better interaction learning, InterFormer enables bidirectional information flow for mutually beneficial learning across different modes. To avoid aggressive information aggregation, we retain complete information in each data mode and use a separate bridging arch for effective information selection and summarization. Our proposed InterFormer achieves state-of-the-art performance on three public datasets and a large-scale industrial dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09852v1-abstract-full').style.display = 'none'; document.getElementById('2411.09852v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09547">arXiv:2411.09547</a> <span> [<a href="https://arxiv.org/pdf/2411.09547">pdf</a>, <a href="https://arxiv.org/format/2411.09547">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Piecing It All Together: Verifying Multi-Hop Multimodal Claims </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+H">Haoran Wang</a>, <a href="/search/cs?searchtype=author&query=Rangapur%2C+A">Aman Rangapur</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiongxiao Xu</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yueqing Liang</a>, <a href="/search/cs?searchtype=author&query=Gharwi%2C+H">Haroon Gharwi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Carl Yang</a>, <a href="/search/cs?searchtype=author&query=Shu%2C+K">Kai Shu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09547v1-abstract-short" style="display: inline;"> Existing claim verification datasets often do not require systems to perform complex reasoning or effectively interpret multimodal evidence. To address this, we introduce a new task: multi-hop multimodal claim verification. This task challenges models to reason over multiple pieces of evidence from diverse sources, including text, images, and tables, and determine whether the combined multimodal e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09547v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09547v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09547v1-abstract-full" style="display: none;"> Existing claim verification datasets often do not require systems to perform complex reasoning or effectively interpret multimodal evidence. To address this, we introduce a new task: multi-hop multimodal claim verification. This task challenges models to reason over multiple pieces of evidence from diverse sources, including text, images, and tables, and determine whether the combined multimodal evidence supports or refutes a given claim. To study this task, we construct MMCV, a large-scale dataset comprising 16k multi-hop claims paired with multimodal evidence, generated and refined using large language models, with additional input from human feedback. We show that MMCV is challenging even for the latest state-of-the-art multimodal large language models, especially as the number of reasoning hops increases. Additionally, we establish a human performance benchmark on a subset of MMCV. We hope this dataset and its evaluation task will encourage future research in multimodal multi-hop claim verification. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09547v1-abstract-full').style.display = 'none'; document.getElementById('2411.09547v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09403">arXiv:2411.09403</a> <span> [<a href="https://arxiv.org/pdf/2411.09403">pdf</a>, <a href="https://arxiv.org/format/2411.09403">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Quantum Machine Learning: An Interplay Between Quantum Computing and Machine Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qi%2C+J">Jun Qi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chao-Han Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S+Y">Samuel Yen-Chi Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Pin-Yu Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09403v1-abstract-short" style="display: inline;"> Quantum machine learning (QML) is a rapidly growing field that combines quantum computing principles with traditional machine learning. It seeks to revolutionize machine learning by harnessing the unique capabilities of quantum mechanics and employs machine learning techniques to advance quantum computing research. This paper introduces quantum computing for the machine learning paradigm, where va… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09403v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09403v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09403v1-abstract-full" style="display: none;"> Quantum machine learning (QML) is a rapidly growing field that combines quantum computing principles with traditional machine learning. It seeks to revolutionize machine learning by harnessing the unique capabilities of quantum mechanics and employs machine learning techniques to advance quantum computing research. This paper introduces quantum computing for the machine learning paradigm, where variational quantum circuits (VQC) are used to develop QML architectures on noisy intermediate-scale quantum (NISQ) devices. We discuss machine learning for the quantum computing paradigm, showcasing our recent theoretical and empirical findings. In particular, we delve into future directions for studying QML, exploring the potential industrial impacts of QML research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09403v1-abstract-full').style.display = 'none'; document.getElementById('2411.09403v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In submission</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08794">arXiv:2411.08794</a> <span> [<a href="https://arxiv.org/pdf/2411.08794">pdf</a>, <a href="https://arxiv.org/format/2411.08794">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Evaluating World Models with LLM for Decision Making </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chang Yang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinrun Wang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Junzhe Jiang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Q">Qinggang Zhang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+X">Xiao Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08794v1-abstract-short" style="display: inline;"> World model emerges as a key module in decision making, where MuZero and Dreamer achieve remarkable successes in complex tasks. Recent work leverages Large Language Models (LLMs) as general world simulators to simulate the dynamics of the world due to their generalizability. LLMs also serve as the world model for deliberative reasoning in Reasoning via Planning (RAP) and Tree of Thought (ToT). How… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08794v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08794v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08794v1-abstract-full" style="display: none;"> World model emerges as a key module in decision making, where MuZero and Dreamer achieve remarkable successes in complex tasks. Recent work leverages Large Language Models (LLMs) as general world simulators to simulate the dynamics of the world due to their generalizability. LLMs also serve as the world model for deliberative reasoning in Reasoning via Planning (RAP) and Tree of Thought (ToT). However, the world models are either evaluated as a general world simulator, or as a functional module of the agent, i.e., predicting the transitions to assist the planning. In this work, we propose a comprehensive evaluation of the world models with LLMs from the decision making perspective. Specifically, we leverage the 31 diverse environments from (Wang et al., 2023;2024) and curate the rule-based policy of each environment for the diverse evaluation. Then, we design three main tasks, i.e., policy verification, action proposal, and policy planning, where the world models can be used for decision making solely. Finally, we conduct the comprehensive evaluation of the advanced LLMs, i.e., GPT-4o and GPT-4o-mini, on the environments for the three main tasks under various settings. The key observations include: i) GPT-4o significantly outperforms GPT-4o-mini on the three main tasks, especially for the tasks which require the domain knowledge, ii) the performance of the world model with LLM will be decreased for long-term decision-making tasks, and iii) the combination of different functionalities of the world model will brings additional unstabilities of the performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08794v1-abstract-full').style.display = 'none'; document.getElementById('2411.08794v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08552">arXiv:2411.08552</a> <span> [<a href="https://arxiv.org/pdf/2411.08552">pdf</a>, <a href="https://arxiv.org/format/2411.08552">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Quantum Physics">quant-ph</span> </div> </div> <p class="title is-5 mathjax"> Leveraging Pre-Trained Neural Networks to Enhance Machine Learning with Variational Quantum Circuits </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Qi%2C+J">Jun Qi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chao-Han Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+S+Y">Samuel Yen-Chi Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+P">Pin-Yu Chen</a>, <a href="/search/cs?searchtype=author&query=Zenil%2C+H">Hector Zenil</a>, <a href="/search/cs?searchtype=author&query=Tegner%2C+J">Jesper Tegner</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08552v1-abstract-short" style="display: inline;"> Quantum Machine Learning (QML) offers tremendous potential but is currently limited by the availability of qubits. We introduce an innovative approach that utilizes pre-trained neural networks to enhance Variational Quantum Circuits (VQC). This technique effectively separates approximation error from qubit count and removes the need for restrictive conditions, making QML more viable for real-world… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08552v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08552v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08552v1-abstract-full" style="display: none;"> Quantum Machine Learning (QML) offers tremendous potential but is currently limited by the availability of qubits. We introduce an innovative approach that utilizes pre-trained neural networks to enhance Variational Quantum Circuits (VQC). This technique effectively separates approximation error from qubit count and removes the need for restrictive conditions, making QML more viable for real-world applications. Our method significantly improves parameter optimization for VQC while delivering notable gains in representation and generalization capabilities, as evidenced by rigorous theoretical analysis and extensive empirical testing on quantum dot classification tasks. Moreover, our results extend to applications such as human genome analysis, demonstrating the broad applicability of our approach. By addressing the constraints of current quantum hardware, our work paves the way for a new era of advanced QML applications, unlocking the full potential of quantum computing in fields such as machine learning, materials science, medicine, mimetics, and various interdisciplinary areas. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08552v1-abstract-full').style.display = 'none'; document.getElementById('2411.08552v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 13 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In submission</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08216">arXiv:2411.08216</a> <span> [<a href="https://arxiv.org/pdf/2411.08216">pdf</a>, <a href="https://arxiv.org/format/2411.08216">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> GTA: Global Tracklet Association for Multi-Object Tracking in Sports </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+J">Jiacheng Sun</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+H">Hsiang-Wei Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cheng-Yen Yang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Z">Zhongyu Jiang</a>, <a href="/search/cs?searchtype=author&query=Hwang%2C+J">Jenq-Neng Hwang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08216v1-abstract-short" style="display: inline;"> Multi-object tracking in sports scenarios has become one of the focal points in computer vision, experiencing significant advancements through the integration of deep learning techniques. Despite these breakthroughs, challenges remain, such as accurately re-identifying players upon re-entry into the scene and minimizing ID switches. In this paper, we propose an appearance-based global tracklet ass… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08216v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08216v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08216v1-abstract-full" style="display: none;"> Multi-object tracking in sports scenarios has become one of the focal points in computer vision, experiencing significant advancements through the integration of deep learning techniques. Despite these breakthroughs, challenges remain, such as accurately re-identifying players upon re-entry into the scene and minimizing ID switches. In this paper, we propose an appearance-based global tracklet association algorithm designed to enhance tracking performance by splitting tracklets containing multiple identities and connecting tracklets seemingly from the same identity. This method can serve as a plug-and-play refinement tool for any multi-object tracker to further boost their performance. The proposed method achieved a new state-of-the-art performance on the SportsMOT dataset with HOTA score of 81.04%. Similarly, on the SoccerNet dataset, our method enhanced multiple trackers' performance, consistently increasing the HOTA score from 79.41% to 83.11%. These significant and consistent improvements across different trackers and datasets underscore our proposed method's potential impact on the application of sports player tracking. We open-source our project codebase at https://github.com/sjc042/gta-link.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08216v1-abstract-full').style.display = 'none'; document.getElementById('2411.08216v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by ACCV 2024 MLCSA Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08165">arXiv:2411.08165</a> <span> [<a href="https://arxiv.org/pdf/2411.08165">pdf</a>, <a href="https://arxiv.org/format/2411.08165">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Retrieval, Reasoning, Re-ranking: A Context-Enriched Framework for Knowledge Graph Completion </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Muzhi Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cehao Yang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chengjin Xu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xuhui Jiang</a>, <a href="/search/cs?searchtype=author&query=Qi%2C+Y">Yiyan Qi</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jian Guo</a>, <a href="/search/cs?searchtype=author&query=Leung%2C+H">Ho-fung Leung</a>, <a href="/search/cs?searchtype=author&query=King%2C+I">Irwin King</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08165v1-abstract-short" style="display: inline;"> The Knowledge Graph Completion~(KGC) task aims to infer the missing entity from an incomplete triple. Existing embedding-based methods rely solely on triples in the KG, which is vulnerable to specious relation patterns and long-tail entities. On the other hand, text-based methods struggle with the semantic gap between KG triples and natural language. Apart from triples, entity contexts (e.g., labe… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08165v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08165v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08165v1-abstract-full" style="display: none;"> The Knowledge Graph Completion~(KGC) task aims to infer the missing entity from an incomplete triple. Existing embedding-based methods rely solely on triples in the KG, which is vulnerable to specious relation patterns and long-tail entities. On the other hand, text-based methods struggle with the semantic gap between KG triples and natural language. Apart from triples, entity contexts (e.g., labels, descriptions, aliases) also play a significant role in augmenting KGs. To address these limitations, we propose KGR3, a context-enriched framework for KGC. KGR3 is composed of three modules. Firstly, the Retrieval module gathers supporting triples from the KG, collects plausible candidate answers from a base embedding model, and retrieves context for each related entity. Then, the Reasoning module employs a large language model to generate potential answers for each query triple. Finally, the Re-ranking module combines candidate answers from the two modules mentioned above, and fine-tunes an LLM to provide the best answer. Extensive experiments on widely used datasets demonstrate that KGR3 consistently improves various KGC methods. Specifically, the best variant of KGR3 achieves absolute Hits@1 improvements of 12.3% and 5.6% on the FB15k237 and WN18RR datasets. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08165v1-abstract-full').style.display = 'none'; document.getElementById('2411.08165v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08147">arXiv:2411.08147</a> <span> [<a href="https://arxiv.org/pdf/2411.08147">pdf</a>, <a href="https://arxiv.org/format/2411.08147">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Large Language Models Can Self-Improve in Long-context Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+S">Siheng Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cheng Yang</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Z">Zesen Cheng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+L">Lemao Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+M">Mo Yu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Y">Yujiu Yang</a>, <a href="/search/cs?searchtype=author&query=Lam%2C+W">Wai Lam</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08147v1-abstract-short" style="display: inline;"> Large language models (LLMs) have achieved substantial progress in processing long contexts but still struggle with long-context reasoning. Existing approaches typically involve fine-tuning LLMs with synthetic data, which depends on annotations from human experts or advanced models like GPT-4, thus restricting further advancements. To address this issue, we investigate the potential for LLMs to se… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08147v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08147v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08147v1-abstract-full" style="display: none;"> Large language models (LLMs) have achieved substantial progress in processing long contexts but still struggle with long-context reasoning. Existing approaches typically involve fine-tuning LLMs with synthetic data, which depends on annotations from human experts or advanced models like GPT-4, thus restricting further advancements. To address this issue, we investigate the potential for LLMs to self-improve in long-context reasoning and propose \ours, an approach specifically designed for this purpose. This approach is straightforward: we sample multiple outputs for each question, score them with Minimum Bayes Risk, and then apply supervised fine-tuning or preference optimization based on these outputs. Extensive experiments on several leading LLMs demonstrate the effectiveness of \ours, with an absolute improvement of $4.2$ points for Llama-3.1-8B-Instruct. Furthermore, \ours achieves superior performance compared to prior approaches that depend on data produced by human experts or advanced models. We anticipate that this work will open new avenues for self-improvement techniques in long-context scenarios, which are essential for the continual advancement of LLMs. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08147v1-abstract-full').style.display = 'none'; document.getElementById('2411.08147v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Project Page: https://github.com/SihengLi99/SEALONG</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08144">arXiv:2411.08144</a> <span> [<a href="https://arxiv.org/pdf/2411.08144">pdf</a>, <a href="https://arxiv.org/format/2411.08144">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Visual Tracking with Intermittent Visibility: Switched Control Design and Implementation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yangge Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+B+C">Benjamin C Yang</a>, <a href="/search/cs?searchtype=author&query=Mitra%2C+S">Sayan Mitra</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08144v1-abstract-short" style="display: inline;"> This paper addresses the problem of visual target tracking in scenarios where a pursuer may experience intermittent loss of visibility of the target. The design of a Switched Visual Tracker (SVT) is presented which aims to meet the competing requirements of maintaining both proximity and visibility. SVT alternates between a visual tracking mode for following the target, and a recovery mode for reg… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08144v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08144v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08144v1-abstract-full" style="display: none;"> This paper addresses the problem of visual target tracking in scenarios where a pursuer may experience intermittent loss of visibility of the target. The design of a Switched Visual Tracker (SVT) is presented which aims to meet the competing requirements of maintaining both proximity and visibility. SVT alternates between a visual tracking mode for following the target, and a recovery mode for regaining visual contact when the target falls out of sight. We establish the stability of SVT by extending the average dwell time theorem from switched systems theory, which may be of independent interest. Our implementation of SVT on an Agilicious drone [1] illustrates its effectiveness on tracking various target trajectories: it reduces the average tracking error by up to 45% and significantly improves visibility duration compared to a baseline algorithm. The results show that our approach effectively handles intermittent vision loss, offering enhanced robustness and adaptability for real-world autonomous missions. Additionally, we demonstrate how the stability analysis provides valuable guidance for selecting parameters, such as tracking speed and recovery distance, to optimize the SVT's performance. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08144v1-abstract-full').style.display = 'none'; document.getElementById('2411.08144v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08056">arXiv:2411.08056</a> <span> [<a href="https://arxiv.org/pdf/2411.08056">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Physics and Society">physics.soc-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Emerging Technologies">cs.ET</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> </div> </div> <p class="title is-5 mathjax"> Biodynamic Analysis of Alpine Skiing with a Skier-Ski-Snow Interaction Model </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gao%2C+N">Nan Gao</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+H">Huitong Jin</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jianqiao Guo</a>, <a href="/search/cs?searchtype=author&query=Ren%2C+G">Gexue Ren</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chun Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08056v1-abstract-short" style="display: inline;"> This study establishes a skier-ski-snow interaction (SSSI) model that integrates a 3D full-body musculoskeletal model, a flexible ski model, a ski-snow contact model, and an air resistance model. An experimental method is developed to collect kinematic and kinetic data using IMUs, GPS, and plantar pressure measurement insoles, which are cost-effective and capable of capturing motion in large-scale… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08056v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08056v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08056v1-abstract-full" style="display: none;"> This study establishes a skier-ski-snow interaction (SSSI) model that integrates a 3D full-body musculoskeletal model, a flexible ski model, a ski-snow contact model, and an air resistance model. An experimental method is developed to collect kinematic and kinetic data using IMUs, GPS, and plantar pressure measurement insoles, which are cost-effective and capable of capturing motion in large-scale field conditions. The ski-snow interaction parameters are optimized for dynamic alignment with snow conditions and individual turning techniques. Forward-inverse dynamics simulation is performed using only the skier's posture as model input and leaving the translational degrees of freedom (DOFs) between the pelvis and the ground unconstrained. The effectiveness of our model is further verified by comparing the simulated results with the collected GPS and plantar pressure data. The correlation coefficient between the simulated ski-snow contact force and the measured plantar pressure data is 0.964, and the error between the predicted motion trajectory and GPS data is 0.7%. By extracting kinematic and kinetic parameters from skiers of different skill levels, quantitative performance analysis helps quantify ski training. The SSSI model with the parameter optimization algorithm of the ski-snow interaction allows for the description of skiing characteristics across varied snow conditions and different turning techniques, such as carving and skidding. Our research advances the understanding of alpine skiing dynamics, informing the development of training programs and facility designs to enhance athlete performance and safety. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08056v1-abstract-full').style.display = 'none'; document.getElementById('2411.08056v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07111">arXiv:2411.07111</a> <span> [<a href="https://arxiv.org/pdf/2411.07111">pdf</a>, <a href="https://arxiv.org/format/2411.07111">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Building a Taiwanese Mandarin Spoken Language Model: A First Attempt </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chih-Kai Yang</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yu-Kuan Fu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen-An Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yi-Cheng Lin</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yu-Xiang Lin</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei-Chih Chen</a>, <a href="/search/cs?searchtype=author&query=Chung%2C+H+L">Ho Lam Chung</a>, <a href="/search/cs?searchtype=author&query=Kuan%2C+C">Chun-Yi Kuan</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+W">Wei-Ping Huang</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Ke-Han Lu</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+T">Tzu-Quan Lin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+H">Hsiu-Hsuan Wang</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+E">En-Pei Hu</a>, <a href="/search/cs?searchtype=author&query=Hsu%2C+C">Chan-Jan Hsu</a>, <a href="/search/cs?searchtype=author&query=Tseng%2C+L">Liang-Hsuan Tseng</a>, <a href="/search/cs?searchtype=author&query=Chiu%2C+I">I-Hsiang Chiu</a>, <a href="/search/cs?searchtype=author&query=Sanga%2C+U">Ulin Sanga</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xuanjun Chen</a>, <a href="/search/cs?searchtype=author&query=Hsu%2C+P">Po-chun Hsu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/cs?searchtype=author&query=Lee%2C+H">Hung-yi Lee</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07111v1-abstract-short" style="display: inline;"> This technical report presents our initial attempt to build a spoken large language model (LLM) for Taiwanese Mandarin, specifically tailored to enable real-time, speech-to-speech interaction in multi-turn conversations. Our end-to-end model incorporates a decoder-only transformer architecture and aims to achieve seamless interaction while preserving the conversational flow, including full-duplex… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07111v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07111v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07111v1-abstract-full" style="display: none;"> This technical report presents our initial attempt to build a spoken large language model (LLM) for Taiwanese Mandarin, specifically tailored to enable real-time, speech-to-speech interaction in multi-turn conversations. Our end-to-end model incorporates a decoder-only transformer architecture and aims to achieve seamless interaction while preserving the conversational flow, including full-duplex capabilities allowing simultaneous speaking and listening. The paper also details the training process, including data preparation with synthesized dialogues and adjustments for real-time interaction. We also developed a platform to evaluate conversational fluency and response coherence in multi-turn dialogues. We hope the release of the report can contribute to the future development of spoken LLMs in Taiwanese Mandarin. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07111v1-abstract-full').style.display = 'none'; document.getElementById('2411.07111v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07074">arXiv:2411.07074</a> <span> [<a href="https://arxiv.org/pdf/2411.07074">pdf</a>, <a href="https://arxiv.org/format/2411.07074">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Increasing Rosacea Awareness Among Population Using Deep Learning and Statistical Approaches </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chengyu Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chengjun Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07074v2-abstract-short" style="display: inline;"> Approximately 16 million Americans suffer from rosacea according to the National Rosacea Society. To increase rosacea awareness, automatic rosacea detection methods using deep learning and explainable statistical approaches are presented in this paper. The deep learning method applies the ResNet-18 for rosacea detection, and the statistical approaches utilize the means of the two classes, namely,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07074v2-abstract-full').style.display = 'inline'; document.getElementById('2411.07074v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07074v2-abstract-full" style="display: none;"> Approximately 16 million Americans suffer from rosacea according to the National Rosacea Society. To increase rosacea awareness, automatic rosacea detection methods using deep learning and explainable statistical approaches are presented in this paper. The deep learning method applies the ResNet-18 for rosacea detection, and the statistical approaches utilize the means of the two classes, namely, the rosacea class vs. the normal class, and the principal component analysis to extract features from the facial images for automatic rosacea detection. The contributions of the proposed methods are three-fold. First, the proposed methods are able to automatically distinguish patients who are suffering from rosacea from people who are clean of this disease. Second, the statistical approaches address the explainability issue that allows doctors and patients to understand and trust the results. And finally, the proposed methods will not only help increase rosacea awareness in the general population but also help remind the patients who suffer from this disease of possible early treatment since rosacea is more treatable at its early stages. The code and data are available at https://github.com/chengyuyang-njit/rosacea_detection.git. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07074v2-abstract-full').style.display = 'none'; document.getElementById('2411.07074v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to 2024 International Conference on Medical Imaging and Computer-Aided Diagnosis</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06823">arXiv:2411.06823</a> <span> [<a href="https://arxiv.org/pdf/2411.06823">pdf</a>, <a href="https://arxiv.org/format/2411.06823">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Large Language Model in Medical Informatics: Direct Classification and Enhanced Text Representations for Automatic ICD Coding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Boukhers%2C+Z">Zeyd Boukhers</a>, <a href="/search/cs?searchtype=author&query=Khan%2C+A">AmeerAli Khan</a>, <a href="/search/cs?searchtype=author&query=Ramadan%2C+Q">Qusai Ramadan</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cong Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06823v1-abstract-short" style="display: inline;"> Addressing the complexity of accurately classifying International Classification of Diseases (ICD) codes from medical discharge summaries is challenging due to the intricate nature of medical documentation. This paper explores the use of Large Language Models (LLM), specifically the LLAMA architecture, to enhance ICD code classification through two methodologies: direct application as a classifier… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06823v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06823v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06823v1-abstract-full" style="display: none;"> Addressing the complexity of accurately classifying International Classification of Diseases (ICD) codes from medical discharge summaries is challenging due to the intricate nature of medical documentation. This paper explores the use of Large Language Models (LLM), specifically the LLAMA architecture, to enhance ICD code classification through two methodologies: direct application as a classifier and as a generator of enriched text representations within a Multi-Filter Residual Convolutional Neural Network (MultiResCNN) framework. We evaluate these methods by comparing them against state-of-the-art approaches, revealing LLAMA's potential to significantly improve classification outcomes by providing deep contextual insights into medical texts. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06823v1-abstract-full').style.display = 'none'; document.getElementById('2411.06823v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">accepted at the 2024 IEEE International Conference on Bioinformatics and Biomedicine (BIBM 2024)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06381">arXiv:2411.06381</a> <span> [<a href="https://arxiv.org/pdf/2411.06381">pdf</a>, <a href="https://arxiv.org/format/2411.06381">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> SAN: Structure-Aware Network for Complex and Long-tailed Chinese Text Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+J">Junyi Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Chang Liu</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chun Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06381v1-abstract-short" style="display: inline;"> In text recognition, complex glyphs and tail classes have always been factors affecting model performance. Specifically for Chinese text recognition, the lack of shape-awareness can lead to confusion among close complex characters. Since such characters are often tail classes that appear less frequently in the training-set, making it harder for the model to capture its shape information. Hence in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06381v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06381v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06381v1-abstract-full" style="display: none;"> In text recognition, complex glyphs and tail classes have always been factors affecting model performance. Specifically for Chinese text recognition, the lack of shape-awareness can lead to confusion among close complex characters. Since such characters are often tail classes that appear less frequently in the training-set, making it harder for the model to capture its shape information. Hence in this work, we propose a structure-aware network utilizing the hierarchical composition information to improve the recognition performance of complex characters. Implementation-wise, we first propose an auxiliary radical branch and integrate it into the base recognition network as a regularization term, which distills hierarchical composition information into the feature extractor. A Tree-Similarity-based weighting mechanism is then proposed to further utilize the depth information in the hierarchical representation. Experiments demonstrate that the proposed approach can significantly improve the performances of complex characters and tail characters, yielding a better overall performance. Code is available at https://github.com/Levi-ZJY/SAN. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06381v1-abstract-full').style.display = 'none'; document.getElementById('2411.06381v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 10 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Published in ICDAR 2023</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05945">arXiv:2411.05945</a> <span> [<a href="https://arxiv.org/pdf/2411.05945">pdf</a>, <a href="https://arxiv.org/format/2411.05945">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> NeKo: Toward Post Recognition Generative Correction Large Language Models with Task-Oriented Experts </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yen-Ting Lin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C+H">Chao-Han Huck Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhehuai Chen</a>, <a href="/search/cs?searchtype=author&query=Zelasko%2C+P">Piotr Zelasko</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xuesong Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zih-Ching Chen</a>, <a href="/search/cs?searchtype=author&query=Puvvada%2C+K+C">Krishna C Puvvada</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+S">Szu-Wei Fu</a>, <a href="/search/cs?searchtype=author&query=Hu%2C+K">Ke Hu</a>, <a href="/search/cs?searchtype=author&query=Chiu%2C+J+W">Jun Wei Chiu</a>, <a href="/search/cs?searchtype=author&query=Balam%2C+J">Jagadeesh Balam</a>, <a href="/search/cs?searchtype=author&query=Ginsburg%2C+B">Boris Ginsburg</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y+F">Yu-Chiang Frank Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05945v1-abstract-short" style="display: inline;"> Construction of a general-purpose post-recognition error corrector poses a crucial question: how can we most effectively train a model on a large mixture of domain datasets? The answer would lie in learning dataset-specific features and digesting their knowledge in a single model. Previous methods achieve this by having separate correction language models, resulting in a significant increase in pa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05945v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05945v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05945v1-abstract-full" style="display: none;"> Construction of a general-purpose post-recognition error corrector poses a crucial question: how can we most effectively train a model on a large mixture of domain datasets? The answer would lie in learning dataset-specific features and digesting their knowledge in a single model. Previous methods achieve this by having separate correction language models, resulting in a significant increase in parameters. In this work, we present Mixture-of-Experts as a solution, highlighting that MoEs are much more than a scalability tool. We propose a Multi-Task Correction MoE, where we train the experts to become an ``expert'' of speech-to-text, language-to-text and vision-to-text datasets by learning to route each dataset's tokens to its mapped expert. Experiments on the Open ASR Leaderboard show that we explore a new state-of-the-art performance by achieving an average relative $5.0$% WER reduction and substantial improvements in BLEU scores for speech and translation tasks. On zero-shot evaluation, NeKo outperforms GPT-3.5 and Claude-Opus with $15.5$% to $27.6$% relative WER reduction in the Hyporadise benchmark. NeKo performs competitively on grammar and post-OCR correction as a multi-task model. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05945v1-abstract-full').style.display = 'none'; document.getElementById('2411.05945v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeKo work has been done in June 2024. NeKo LMs will be open source on https://huggingface.co/nvidia under the MIT license</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05497">arXiv:2411.05497</a> <span> [<a href="https://arxiv.org/pdf/2411.05497">pdf</a>, <a href="https://arxiv.org/format/2411.05497">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> Tightly-Coupled, Speed-aided Monocular Visual-Inertial Localization in Topological Map </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chanuk Yang</a>, <a href="/search/cs?searchtype=author&query=O%2C+H">Hayeon O</a>, <a href="/search/cs?searchtype=author&query=Huh%2C+K">Kunsoo Huh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05497v1-abstract-short" style="display: inline;"> This paper proposes a novel algorithm for vehicle speed-aided monocular visual-inertial localization using a topological map. The proposed system aims to address the limitations of existing methods that rely heavily on expensive sensors like GPS and LiDAR by leveraging relatively inexpensive camera-based pose estimation. The topological map is generated offline from LiDAR point clouds and includes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05497v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05497v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05497v1-abstract-full" style="display: none;"> This paper proposes a novel algorithm for vehicle speed-aided monocular visual-inertial localization using a topological map. The proposed system aims to address the limitations of existing methods that rely heavily on expensive sensors like GPS and LiDAR by leveraging relatively inexpensive camera-based pose estimation. The topological map is generated offline from LiDAR point clouds and includes depth images, intensity images, and corresponding camera poses. This map is then used for real-time localization through correspondence matching between current camera images and the stored topological images. The system employs an Iterated Error State Kalman Filter (IESKF) for optimized pose estimation, incorporating correspondence among images and vehicle speed measurements to enhance accuracy. Experimental results using both open dataset and our collected data in challenging scenario, such as tunnel, demonstrate the proposed algorithm's superior performance in topological map generation and localization tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05497v1-abstract-full').style.display = 'none'; document.getElementById('2411.05497v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05361">arXiv:2411.05361</a> <span> [<a href="https://arxiv.org/pdf/2411.05361">pdf</a>, <a href="https://arxiv.org/format/2411.05361">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Dynamic-SUPERB Phase-2: A Collaboratively Expanding Benchmark for Measuring the Capabilities of Spoken Language Models with 180 Tasks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chien-yu Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wei-Chih Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+S">Shu-wen Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+A+T">Andy T. Liu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+C">Chen-An Li</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">Yu-Xiang Lin</a>, <a href="/search/cs?searchtype=author&query=Tseng%2C+W">Wei-Cheng Tseng</a>, <a href="/search/cs?searchtype=author&query=Diwan%2C+A">Anuj Diwan</a>, <a href="/search/cs?searchtype=author&query=Shih%2C+Y">Yi-Jen Shih</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+J">Jiatong Shi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">William Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xuanjun Chen</a>, <a href="/search/cs?searchtype=author&query=Hsiao%2C+C">Chi-Yuan Hsiao</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+P">Puyuan Peng</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shih-Heng Wang</a>, <a href="/search/cs?searchtype=author&query=Kuan%2C+C">Chun-Yi Kuan</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+K">Ke-Han Lu</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+K">Kai-Wei Chang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chih-Kai Yang</a>, <a href="/search/cs?searchtype=author&query=Ritter-Gutierrez%2C+F">Fabian Ritter-Gutierrez</a>, <a href="/search/cs?searchtype=author&query=Chuang%2C+M+T">Ming To Chuang</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+K">Kuan-Po Huang</a>, <a href="/search/cs?searchtype=author&query=Arora%2C+S">Siddhant Arora</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Y">You-Kuan Lin</a>, <a href="/search/cs?searchtype=author&query=Yeo%2C+E">Eunjung Yeo</a> , et al. (53 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05361v1-abstract-short" style="display: inline;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05361v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05361v1-abstract-full" style="display: none;"> Multimodal foundation models, such as Gemini and ChatGPT, have revolutionized human-machine interactions by seamlessly integrating various forms of data. Developing a universal spoken language model that comprehends a wide range of natural language instructions is critical for bridging communication gaps and facilitating more intuitive interactions. However, the absence of a comprehensive evaluation benchmark poses a significant challenge. We present Dynamic-SUPERB Phase-2, an open and evolving benchmark for the comprehensive evaluation of instruction-based universal speech models. Building upon the first generation, this second version incorporates 125 new tasks contributed collaboratively by the global research community, expanding the benchmark to a total of 180 tasks, making it the largest benchmark for speech and audio evaluation. While the first generation of Dynamic-SUPERB was limited to classification tasks, Dynamic-SUPERB Phase-2 broadens its evaluation capabilities by introducing a wide array of novel and diverse tasks, including regression and sequence generation, across speech, music, and environmental audio. Evaluation results indicate that none of the models performed well universally. SALMONN-13B excelled in English ASR, while WavLLM demonstrated high accuracy in emotion recognition, but current models still require further innovations to handle a broader range of tasks. We will soon open-source all task data and the evaluation pipeline. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05361v1-abstract-full').style.display = 'none'; document.getElementById('2411.05361v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04798">arXiv:2411.04798</a> <span> [<a href="https://arxiv.org/pdf/2411.04798">pdf</a>, <a href="https://arxiv.org/format/2411.04798">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Orbit: A Framework for Designing and Evaluating Multi-objective Rankers </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chenyang Yang</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+T">Tesi Xiao</a>, <a href="/search/cs?searchtype=author&query=Shavlovsky%2C+M">Michael Shavlovsky</a>, <a href="/search/cs?searchtype=author&query=K%C3%A4stner%2C+C">Christian K盲stner</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+T">Tongshuang Wu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04798v1-abstract-short" style="display: inline;"> Machine learning in production needs to balance multiple objectives: This is particularly evident in ranking or recommendation models, where conflicting objectives such as user engagement, satisfaction, diversity, and novelty must be considered at the same time. However, designing multi-objective rankers is inherently a dynamic wicked problem -- there is no single optimal solution, and the needs e… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04798v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04798v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04798v1-abstract-full" style="display: none;"> Machine learning in production needs to balance multiple objectives: This is particularly evident in ranking or recommendation models, where conflicting objectives such as user engagement, satisfaction, diversity, and novelty must be considered at the same time. However, designing multi-objective rankers is inherently a dynamic wicked problem -- there is no single optimal solution, and the needs evolve over time. Effective design requires collaboration between cross-functional teams and careful analysis of a wide range of information. In this work, we introduce Orbit, a conceptual framework for Objective-centric Ranker Building and Iteration. The framework places objectives at the center of the design process, to serve as boundary objects for communication and guide practitioners for design and evaluation. We implement Orbit as an interactive system, which enables stakeholders to interact with objective spaces directly and supports real-time exploration and evaluation of design trade-offs. We evaluate Orbit through a user study involving twelve industry practitioners, showing that it supports efficient design space exploration, leads to more informed decision-making, and enhances awareness of the inherent trade-offs of multiple objectives. Orbit (1) opens up new opportunities of an objective-centric design process for any multi-objective ML models, as well as (2) sheds light on future designs that push practitioners to go beyond a narrow metric-centric or example-centric mindset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04798v1-abstract-full').style.display = 'none'; document.getElementById('2411.04798v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.04669">arXiv:2411.04669</a> <span> [<a href="https://arxiv.org/pdf/2411.04669">pdf</a>, <a href="https://arxiv.org/format/2411.04669">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> EffiCANet: Efficient Time Series Forecasting with Convolutional Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+X">Xinxing Zhou</a>, <a href="/search/cs?searchtype=author&query=Ye%2C+J">Jiaqi Ye</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+S">Shubao Zhao</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+M">Ming Jin</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chengyi Yang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+Y">Yanlong Wen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+X">Xiaojie Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.04669v1-abstract-short" style="display: inline;"> The exponential growth of multivariate time series data from sensor networks in domains like industrial monitoring and smart cities requires efficient and accurate forecasting models. Current deep learning methods often fail to adequately capture long-range dependencies and complex inter-variable relationships, especially under real-time processing constraints. These limitations arise as many mode… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04669v1-abstract-full').style.display = 'inline'; document.getElementById('2411.04669v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.04669v1-abstract-full" style="display: none;"> The exponential growth of multivariate time series data from sensor networks in domains like industrial monitoring and smart cities requires efficient and accurate forecasting models. Current deep learning methods often fail to adequately capture long-range dependencies and complex inter-variable relationships, especially under real-time processing constraints. These limitations arise as many models are optimized for either short-term forecasting with limited receptive fields or long-term accuracy at the cost of efficiency. Additionally, dynamic and intricate interactions between variables in real-world data further complicate modeling efforts. To address these limitations, we propose EffiCANet, an Efficient Convolutional Attention Network designed to enhance forecasting accuracy while maintaining computational efficiency. EffiCANet integrates three key components: (1) a Temporal Large-kernel Decomposed Convolution (TLDC) module that captures long-term temporal dependencies while reducing computational overhead; (2) an Inter-Variable Group Convolution (IVGC) module that captures complex and evolving relationships among variables; and (3) a Global Temporal-Variable Attention (GTVA) mechanism that prioritizes critical temporal and inter-variable features. Extensive evaluations across nine benchmark datasets show that EffiCANet achieves the maximum reduction of 10.02% in MAE over state-of-the-art models, while cutting computational costs by 26.2% relative to conventional large-kernel convolution methods, thanks to its efficient decomposition strategy. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.04669v1-abstract-full').style.display = 'none'; document.getElementById('2411.04669v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03556">arXiv:2411.03556</a> <span> [<a href="https://arxiv.org/pdf/2411.03556">pdf</a>, <a href="https://arxiv.org/format/2411.03556">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> </div> </div> <p class="title is-5 mathjax"> VQ-ACE: Efficient Policy Search for Dexterous Robotic Manipulation via Action Chunking Embedding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chenyu Yang</a>, <a href="/search/cs?searchtype=author&query=Liconti%2C+D">Davide Liconti</a>, <a href="/search/cs?searchtype=author&query=Katzschmann%2C+R+K">Robert K. Katzschmann</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03556v1-abstract-short" style="display: inline;"> Dexterous robotic manipulation remains a significant challenge due to the high dimensionality and complexity of hand movements required for tasks like in-hand manipulation and object grasping. This paper addresses this issue by introducing Vector Quantized Action Chunking Embedding (VQ-ACE), a novel framework that compresses human hand motion into a quantized latent space, significantly reducing t… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03556v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03556v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03556v1-abstract-full" style="display: none;"> Dexterous robotic manipulation remains a significant challenge due to the high dimensionality and complexity of hand movements required for tasks like in-hand manipulation and object grasping. This paper addresses this issue by introducing Vector Quantized Action Chunking Embedding (VQ-ACE), a novel framework that compresses human hand motion into a quantized latent space, significantly reducing the action space's dimensionality while preserving key motion characteristics. By integrating VQ-ACE with both Model Predictive Control (MPC) and Reinforcement Learning (RL), we enable more efficient exploration and policy learning in dexterous manipulation tasks using a biomimetic robotic hand. Our results show that latent space sampling with MPC produces more human-like behavior in tasks such as Ball Rolling and Object Picking, leading to higher task success rates and reduced control costs. For RL, action chunking accelerates learning and improves exploration, demonstrated through faster convergence in tasks like cube stacking and in-hand cube reorientation. These findings suggest that VQ-ACE offers a scalable and effective solution for robotic manipulation tasks involving complex, high-dimensional state spaces, contributing to more natural and adaptable robotic systems. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03556v1-abstract-full').style.display = 'none'; document.getElementById('2411.03556v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02794">arXiv:2411.02794</a> <span> [<a href="https://arxiv.org/pdf/2411.02794">pdf</a>, <a href="https://arxiv.org/format/2411.02794">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Real-Time Text Detection with Similar Mask in Traffic, Industrial, and Natural Scenes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Han%2C+X">Xu Han</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Junyu Gao</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chuang Yang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+Y">Yuan Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Q">Qi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02794v1-abstract-short" style="display: inline;"> Texts on the intelligent transportation scene include mass information. Fully harnessing this information is one of the critical drivers for advancing intelligent transportation. Unlike the general scene, detecting text in transportation has extra demand, such as a fast inference speed, except for high accuracy. Most existing real-time text detection methods are based on the shrink mask, which los… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02794v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02794v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02794v1-abstract-full" style="display: none;"> Texts on the intelligent transportation scene include mass information. Fully harnessing this information is one of the critical drivers for advancing intelligent transportation. Unlike the general scene, detecting text in transportation has extra demand, such as a fast inference speed, except for high accuracy. Most existing real-time text detection methods are based on the shrink mask, which loses some geometry semantic information and needs complex post-processing. In addition, the previous method usually focuses on correct output, which ignores feature correction and lacks guidance during the intermediate process. To this end, we propose an efficient multi-scene text detector that contains an effective text representation similar mask (SM) and a feature correction module (FCM). Unlike previous methods, the former aims to preserve the geometric information of the instances as much as possible. Its post-progressing saves 50$\%$ of the time, accurately and efficiently reconstructing text contours. The latter encourages false positive features to move away from the positive feature center, optimizing the predictions from the feature level. Some ablation studies demonstrate the efficiency of the SM and the effectiveness of the FCM. Moreover, the deficiency of existing traffic datasets (such as the low-quality annotation or closed source data unavailability) motivated us to collect and annotate a traffic text dataset, which introduces motion blur. In addition, to validate the scene robustness of the SM-Net, we conduct experiments on traffic, industrial, and natural scene datasets. Extensive experiments verify it achieves (SOTA) performance on several benchmarks. The code and dataset are available at: \url{https://github.com/fengmulin/SMNet}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02794v1-abstract-full').style.display = 'none'; document.getElementById('2411.02794v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02523">arXiv:2411.02523</a> <span> [<a href="https://arxiv.org/pdf/2411.02523">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Evaluating the Impact of Lab Test Results on Large Language Models Generated Differential Diagnoses from Clinical Case Vignettes </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Bhasuran%2C+B">Balu Bhasuran</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Q">Qiao Jin</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yuzhang Xie</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Carl Yang</a>, <a href="/search/cs?searchtype=author&query=Hanna%2C+K">Karim Hanna</a>, <a href="/search/cs?searchtype=author&query=Costa%2C+J">Jennifer Costa</a>, <a href="/search/cs?searchtype=author&query=Shavor%2C+C">Cindy Shavor</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+Z">Zhiyong Lu</a>, <a href="/search/cs?searchtype=author&query=He%2C+Z">Zhe He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02523v1-abstract-short" style="display: inline;"> Differential diagnosis is crucial for medicine as it helps healthcare providers systematically distinguish between conditions that share similar symptoms. This study assesses the impact of lab test results on differential diagnoses (DDx) made by large language models (LLMs). Clinical vignettes from 50 case reports from PubMed Central were created incorporating patient demographics, symptoms, and l… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02523v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02523v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02523v1-abstract-full" style="display: none;"> Differential diagnosis is crucial for medicine as it helps healthcare providers systematically distinguish between conditions that share similar symptoms. This study assesses the impact of lab test results on differential diagnoses (DDx) made by large language models (LLMs). Clinical vignettes from 50 case reports from PubMed Central were created incorporating patient demographics, symptoms, and lab results. Five LLMs GPT-4, GPT-3.5, Llama-2-70b, Claude-2, and Mixtral-8x7B were tested to generate Top 10, Top 5, and Top 1 DDx with and without lab data. A comprehensive evaluation involving GPT-4, a knowledge graph, and clinicians was conducted. GPT-4 performed best, achieving 55% accuracy for Top 1 diagnoses and 60% for Top 10 with lab data, with lenient accuracy up to 80%. Lab results significantly improved accuracy, with GPT-4 and Mixtral excelling, though exact match rates were low. Lab tests, including liver function, metabolic/toxicology panels, and serology/immune tests, were generally interpreted correctly by LLMs for differential diagnosis. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02523v1-abstract-full').style.display = 'none'; document.getElementById('2411.02523v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02457">arXiv:2411.02457</a> <span> [<a href="https://arxiv.org/pdf/2411.02457">pdf</a>, <a href="https://arxiv.org/format/2411.02457">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> A Multi-Task Role-Playing Agent Capable of Imitating Character Linguistic Styles </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+S">Siyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Si%2C+Q">Qingyi Si</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chenxu Yang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+Y">Yunzhi Liang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+Z">Zheng Lin</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Huan Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Weiping Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02457v1-abstract-short" style="display: inline;"> The advent of large language models (LLMs) has significantly propelled the advancement of Role-Playing Agents (RPAs). However, current Role-Playing Agents predominantly focus on mimicking a character's fundamental attributes while neglecting the replication of linguistic style, and they are incapable of effectively replicating characters when performing tasks beyond multi-turn dialogues, which res… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02457v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02457v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02457v1-abstract-full" style="display: none;"> The advent of large language models (LLMs) has significantly propelled the advancement of Role-Playing Agents (RPAs). However, current Role-Playing Agents predominantly focus on mimicking a character's fundamental attributes while neglecting the replication of linguistic style, and they are incapable of effectively replicating characters when performing tasks beyond multi-turn dialogues, which results in generated responses that lack authenticity. The reason current RPAs lack this capability is due to the nature of existing character datasets, which lack collections of character quotations and are limited to multi-turn dialogue tasks, constraining the RPA's performance across other task domains and failing to mimic a character's linguistic style. To address this gap, we developed a multi-task role-playing dataset named MRstyle, which encompasses a substantial number of real individuals along with their quotations and covers seven different tasks. On this basis, we develop StyleRPA, a Multi-Task Role-Playing Agent (MRPA) that significantly outperforms recent open-source LLMs and RPAs baselines on 7 tasks including Dialogue, Dictionary, Composition, Story Generation, Product Description, Music Commentary, and Open Question Answering. The code and data will be released. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02457v1-abstract-full').style.display = 'none'; document.getElementById('2411.02457v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01663">arXiv:2411.01663</a> <span> [<a href="https://arxiv.org/pdf/2411.01663">pdf</a>, <a href="https://arxiv.org/format/2411.01663">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computational Complexity">cs.CC</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Unlocking the Theory Behind Scaling 1-Bit Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Daliri%2C+M">Majid Daliri</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zhao Song</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chiwun Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01663v1-abstract-short" style="display: inline;"> Recently, 1-bit Large Language Models (LLMs) have emerged, showcasing an impressive combination of efficiency and performance that rivals traditional LLMs. Research by Wang et al. (2023); Ma et al. (2024) indicates that the performance of these 1-bit LLMs progressively improves as the number of parameters increases, hinting at the potential existence of a Scaling Law for 1-bit Neural Networks. In… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01663v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01663v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01663v1-abstract-full" style="display: none;"> Recently, 1-bit Large Language Models (LLMs) have emerged, showcasing an impressive combination of efficiency and performance that rivals traditional LLMs. Research by Wang et al. (2023); Ma et al. (2024) indicates that the performance of these 1-bit LLMs progressively improves as the number of parameters increases, hinting at the potential existence of a Scaling Law for 1-bit Neural Networks. In this paper, we present the first theoretical result that rigorously establishes this scaling law for 1-bit models. We prove that, despite the constraint of weights restricted to $\{-1, +1\}$, the dynamics of model training inevitably align with kernel behavior as the network width grows. This theoretical breakthrough guarantees convergence of the 1-bit model to an arbitrarily small loss as width increases. Furthermore, we introduce the concept of the generalization difference, defined as the gap between the outputs of 1-bit networks and their full-precision counterparts, and demonstrate that this difference maintains a negligible level as network width scales. Building on the work of Kaplan et al. (2020), we conclude by examining how the training loss scales as a power-law function of the model size, dataset size, and computational resources utilized for training. Our findings underscore the promising potential of scaling 1-bit neural networks, suggesting that int1 could become the standard in future neural network precision. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01663v1-abstract-full').style.display = 'none'; document.getElementById('2411.01663v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 3 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01176">arXiv:2411.01176</a> <span> [<a href="https://arxiv.org/pdf/2411.01176">pdf</a>, <a href="https://arxiv.org/format/2411.01176">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> CmdCaliper: A Semantic-Aware Command-Line Embedding Model and Dataset for Security Research </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+S">Sian-Yao Huang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cheng-Lin Yang</a>, <a href="/search/cs?searchtype=author&query=Lin%2C+C">Che-Yu Lin</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Chun-Ying Huang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01176v1-abstract-short" style="display: inline;"> This research addresses command-line embedding in cybersecurity, a field obstructed by the lack of comprehensive datasets due to privacy and regulation concerns. We propose the first dataset of similar command lines, named CyPHER, for training and unbiased evaluation. The training set is generated using a set of large language models (LLMs) comprising 28,520 similar command-line pairs. Our testing… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01176v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01176v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01176v1-abstract-full" style="display: none;"> This research addresses command-line embedding in cybersecurity, a field obstructed by the lack of comprehensive datasets due to privacy and regulation concerns. We propose the first dataset of similar command lines, named CyPHER, for training and unbiased evaluation. The training set is generated using a set of large language models (LLMs) comprising 28,520 similar command-line pairs. Our testing dataset consists of 2,807 similar command-line pairs sourced from authentic command-line data. In addition, we propose a command-line embedding model named CmdCaliper, enabling the computation of semantic similarity with command lines. Performance evaluations demonstrate that the smallest version of CmdCaliper (30 million parameters) suppresses state-of-the-art (SOTA) sentence embedding models with ten times more parameters across various tasks (e.g., malicious command-line detection and similar command-line retrieval). Our study explores the feasibility of data generation using LLMs in the cybersecurity domain. Furthermore, we release our proposed command-line dataset, embedding models' weights and all program codes to the public. This advancement paves the way for more effective command-line embedding for future researchers. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01176v1-abstract-full').style.display = 'none'; document.getElementById('2411.01176v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01171">arXiv:2411.01171</a> <span> [<a href="https://arxiv.org/pdf/2411.01171">pdf</a>, <a href="https://arxiv.org/format/2411.01171">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fast and Memory-Efficient Video Diffusion Using Streamlined Inference </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhan%2C+Z">Zheng Zhan</a>, <a href="/search/cs?searchtype=author&query=Wu%2C+Y">Yushu Wu</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yifan Gong</a>, <a href="/search/cs?searchtype=author&query=Meng%2C+Z">Zichong Meng</a>, <a href="/search/cs?searchtype=author&query=Kong%2C+Z">Zhenglun Kong</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Changdi Yang</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+G">Geng Yuan</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+P">Pu Zhao</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+W">Wei Niu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yanzhi Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01171v1-abstract-short" style="display: inline;"> The rapid progress in artificial intelligence-generated content (AIGC), especially with diffusion models, has significantly advanced development of high-quality video generation. However, current video diffusion models exhibit demanding computational requirements and high peak memory usage, especially for generating longer and higher-resolution videos. These limitations greatly hinder the practica… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01171v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01171v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01171v1-abstract-full" style="display: none;"> The rapid progress in artificial intelligence-generated content (AIGC), especially with diffusion models, has significantly advanced development of high-quality video generation. However, current video diffusion models exhibit demanding computational requirements and high peak memory usage, especially for generating longer and higher-resolution videos. These limitations greatly hinder the practical application of video diffusion models on standard hardware platforms. To tackle this issue, we present a novel, training-free framework named Streamlined Inference, which leverages the temporal and spatial properties of video diffusion models. Our approach integrates three core components: Feature Slicer, Operator Grouping, and Step Rehash. Specifically, Feature Slicer effectively partitions input features into sub-features and Operator Grouping processes each sub-feature with a group of consecutive operators, resulting in significant memory reduction without sacrificing the quality or speed. Step Rehash further exploits the similarity between adjacent steps in diffusion, and accelerates inference through skipping unnecessary steps. Extensive experiments demonstrate that our approach significantly reduces peak memory and computational overhead, making it feasible to generate high-quality videos on a single consumer GPU (e.g., reducing peak memory of AnimateDiff from 42GB to 11GB, featuring faster inference on 2080Ti). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01171v1-abstract-full').style.display = 'none'; document.getElementById('2411.01171v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted to NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01016">arXiv:2411.01016</a> <span> [<a href="https://arxiv.org/pdf/2411.01016">pdf</a>, <a href="https://arxiv.org/format/2411.01016">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> MoE-I$^2$: Compressing Mixture of Experts Models through Inter-Expert Pruning and Intra-Expert Low-Rank Decomposition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cheng Yang</a>, <a href="/search/cs?searchtype=author&query=Sui%2C+Y">Yang Sui</a>, <a href="/search/cs?searchtype=author&query=Xiao%2C+J">Jinqi Xiao</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+L">Lingyi Huang</a>, <a href="/search/cs?searchtype=author&query=Gong%2C+Y">Yu Gong</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+Y">Yuanlin Duan</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+W">Wenqi Jia</a>, <a href="/search/cs?searchtype=author&query=Yin%2C+M">Miao Yin</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+Y">Yu Cheng</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+B">Bo Yuan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01016v1-abstract-short" style="display: inline;"> The emergence of Mixture of Experts (MoE) LLMs has significantly advanced the development of language models. Compared to traditional LLMs, MoE LLMs outperform traditional LLMs by achieving higher performance with considerably fewer activated parameters. Despite this efficiency, their enormous parameter size still leads to high deployment costs. In this paper, we introduce a two-stage compression… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01016v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01016v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01016v1-abstract-full" style="display: none;"> The emergence of Mixture of Experts (MoE) LLMs has significantly advanced the development of language models. Compared to traditional LLMs, MoE LLMs outperform traditional LLMs by achieving higher performance with considerably fewer activated parameters. Despite this efficiency, their enormous parameter size still leads to high deployment costs. In this paper, we introduce a two-stage compression method tailored for MoE to reduce the model size and decrease the computational cost. First, in the inter-expert pruning stage, we analyze the importance of each layer and propose the Layer-wise Genetic Search and Block-wise KT-Reception Field with the non-uniform pruning ratio to prune the individual expert. Second, in the intra-expert decomposition stage, we apply the low-rank decomposition to further compress the parameters within the remaining experts. Extensive experiments on Qwen1.5-MoE-A2.7B, DeepSeek-V2-Lite, and Mixtral-8$\times$7B demonstrate that our proposed methods can both reduce the model size and enhance inference efficiency while maintaining performance in various zero-shot tasks. The code will be available at \url{https://github.com/xiaochengsky/MoEI-2.git} <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01016v1-abstract-full').style.display = 'none'; document.getElementById('2411.01016v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00785">arXiv:2411.00785</a> <span> [<a href="https://arxiv.org/pdf/2411.00785">pdf</a>, <a href="https://arxiv.org/format/2411.00785">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> IGOR: Image-GOal Representations are the Atomic Control Units for Foundation Models in Embodied AI </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiaoyu Chen</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Junliang Guo</a>, <a href="/search/cs?searchtype=author&query=He%2C+T">Tianyu He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chuheng Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+P">Pushi Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+D+C">Derek Cathera Yang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+L">Li Zhao</a>, <a href="/search/cs?searchtype=author&query=Bian%2C+J">Jiang Bian</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00785v1-abstract-short" style="display: inline;"> We introduce Image-GOal Representations (IGOR), aiming to learn a unified, semantically consistent action space across human and various robots. Through this unified latent action space, IGOR enables knowledge transfer among large-scale robot and human activity data. We achieve this by compressing visual changes between an initial image and its goal state into latent actions. IGOR allows us to gen… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00785v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00785v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00785v1-abstract-full" style="display: none;"> We introduce Image-GOal Representations (IGOR), aiming to learn a unified, semantically consistent action space across human and various robots. Through this unified latent action space, IGOR enables knowledge transfer among large-scale robot and human activity data. We achieve this by compressing visual changes between an initial image and its goal state into latent actions. IGOR allows us to generate latent action labels for internet-scale video data. This unified latent action space enables the training of foundation policy and world models across a wide variety of tasks performed by both robots and humans. We demonstrate that: (1) IGOR learns a semantically consistent action space for both human and robots, characterizing various possible motions of objects representing the physical interaction knowledge; (2) IGOR can "migrate" the movements of the object in the one video to other videos, even across human and robots, by jointly using the latent action model and world model; (3) IGOR can learn to align latent actions with natural language through the foundation policy model, and integrate latent actions with a low-level policy model to achieve effective robot control. We believe IGOR opens new possibilities for human-to-robot knowledge transfer and control. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00785v1-abstract-full').style.display = 'none'; document.getElementById('2411.00785v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00031">arXiv:2411.00031</a> <span> [<a href="https://arxiv.org/pdf/2411.00031">pdf</a>, <a href="https://arxiv.org/ps/2411.00031">ps</a>, <a href="https://arxiv.org/format/2411.00031">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Symbolic Computation">cs.SC</span> </div> </div> <p class="title is-5 mathjax"> A Theoretical Review on Solving Algebra Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+X">Xinguo Yu</a>, <a href="/search/cs?searchtype=author&query=Cheng%2C+W">Weina Cheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chuanzhi Yang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Ting Zhang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00031v1-abstract-short" style="display: inline;"> Solving algebra problems (APs) continues to attract significant research interest as evidenced by the large number of algorithms and theories proposed over the past decade. Despite these important research contributions, however, the body of work remains incomplete in terms of theoretical justification and scope. The current contribution intends to fill the gap by developing a review framework tha… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00031v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00031v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00031v1-abstract-full" style="display: none;"> Solving algebra problems (APs) continues to attract significant research interest as evidenced by the large number of algorithms and theories proposed over the past decade. Despite these important research contributions, however, the body of work remains incomplete in terms of theoretical justification and scope. The current contribution intends to fill the gap by developing a review framework that aims to lay a theoretical base, create an evaluation scheme, and extend the scope of the investigation. This paper first develops the State Transform Theory (STT), which emphasizes that the problem-solving algorithms are structured according to states and transforms unlike the understanding that underlies traditional surveys which merely emphasize the progress of transforms. The STT, thus, lays the theoretical basis for a new framework for reviewing algorithms. This new construct accommodates the relation-centric algorithms for solving both word and diagrammatic algebra problems. The latter not only highlights the necessity of introducing new states but also allows revelation of contributions of individual algorithms obscured in prior reviews without this approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00031v1-abstract-full').style.display = 'none'; document.getElementById('2411.00031v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">22pages,5figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23929">arXiv:2410.23929</a> <span> [<a href="https://arxiv.org/pdf/2410.23929">pdf</a>, <a href="https://arxiv.org/format/2410.23929">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Systems and Control">eess.SY</span> </div> </div> <p class="title is-5 mathjax"> Redundant Observer-Based Tracking Control for Object Extraction Using a Cable Connected UAV </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Marshall%2C+B+J">Benjamin J. Marshall</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+Y">Yunda Yan</a>, <a href="/search/cs?searchtype=author&query=Knowles%2C+J">James Knowles</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chenguang Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+C">Cunjia Liu</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23929v1-abstract-short" style="display: inline;"> A new disturbance observer based control scheme is developed for a quadrotor under the concurrent disturbances from a lightweight elastic tether cable and a lumped vertical disturbance. This elastic tether is unusual as it creates a disturbance proportional to the multicopter's translational movement. This paper takes an observer-based approach to estimate the stiffness coefficient of the cable an… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23929v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23929v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23929v1-abstract-full" style="display: none;"> A new disturbance observer based control scheme is developed for a quadrotor under the concurrent disturbances from a lightweight elastic tether cable and a lumped vertical disturbance. This elastic tether is unusual as it creates a disturbance proportional to the multicopter's translational movement. This paper takes an observer-based approach to estimate the stiffness coefficient of the cable and uses the system model to update the estimates of the external forces, which are then compensated in the control action. Given that the tethered cable force affects both horizontal channels of the quadrotor and is also coupled with the vertical channel, the proposed disturbance observer is constructed to exploit the redundant measurements across all three channels to jointly estimate the cable stiffness and the vertical disturbance. A pseudo-inverse method is used to determine the observer gain functions, such that the estimation of the two quantities is decoupled and stable. Compared to standard disturbance observers which assume nearly constant disturbances, the proposed approach can quickly adjust its total force estimate as the tethered quadrotor changes its position or tautness of the tether. This is applied to two experiments - a tracking performance test where the multicopter moves under a constant tether strain, and an object extraction test. In the second test, the multicopter manipulates a nonlinear mechanism mimicking the extraction of a wedged object. In both cases, the proposed approach shows significant improvement over standard Disturbance Observer and Extended State Observer approaches. A video summary of the experiments can be found at https://youtu.be/9gKr13WTj-k. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23929v1-abstract-full').style.display = 'none'; document.getElementById('2410.23929v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23692">arXiv:2410.23692</a> <span> [<a href="https://arxiv.org/pdf/2410.23692">pdf</a>, <a href="https://arxiv.org/format/2410.23692">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computers and Society">cs.CY</span> </div> </div> <p class="title is-5 mathjax"> Instruction-Tuning Llama-3-8B Excels in City-Scale Mobility Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Tang%2C+P">Peizhi Tang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chuang Yang</a>, <a href="/search/cs?searchtype=author&query=Xing%2C+T">Tong Xing</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+X">Xiaohang Xu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+R">Renhe Jiang</a>, <a href="/search/cs?searchtype=author&query=Sezaki%2C+K">Kaoru Sezaki</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23692v1-abstract-short" style="display: inline;"> Human mobility prediction plays a critical role in applications such as disaster response, urban planning, and epidemic forecasting. Traditional methods often rely on designing crafted, domain-specific models, and typically focus on short-term predictions, which struggle to generalize across diverse urban environments. In this study, we introduce Llama-3-8B-Mob, a large language model fine-tuned w… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23692v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23692v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23692v1-abstract-full" style="display: none;"> Human mobility prediction plays a critical role in applications such as disaster response, urban planning, and epidemic forecasting. Traditional methods often rely on designing crafted, domain-specific models, and typically focus on short-term predictions, which struggle to generalize across diverse urban environments. In this study, we introduce Llama-3-8B-Mob, a large language model fine-tuned with instruction tuning, for long-term citywide mobility prediction -- in a Q&A manner. We validate our approach using large-scale human mobility data from four metropolitan areas in Japan, focusing on predicting individual trajectories over the next 15 days. The results demonstrate that Llama-3-8B-Mob excels in modeling long-term human mobility -- surpassing the state-of-the-art on multiple prediction metrics. It also displays strong zero-shot generalization capabilities -- effectively generalizing to other cities even when fine-tuned only on limited samples from a single city. Source codes are available at https://github.com/TANGHULU6/Llama3-8B-Mob. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23692v1-abstract-full').style.display = 'none'; document.getElementById('2410.23692v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23686">arXiv:2410.23686</a> <span> [<a href="https://arxiv.org/pdf/2410.23686">pdf</a>, <a href="https://arxiv.org/format/2410.23686">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Towards Dynamic Message Passing on Graphs </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Sun%2C+J">Junshu Sun</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chenxue Yang</a>, <a href="/search/cs?searchtype=author&query=Ji%2C+X">Xiangyang Ji</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+Q">Qingming Huang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+S">Shuhui Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23686v1-abstract-short" style="display: inline;"> Message passing plays a vital role in graph neural networks (GNNs) for effective feature learning. However, the over-reliance on input topology diminishes the efficacy of message passing and restricts the ability of GNNs. Despite efforts to mitigate the reliance, existing study encounters message-passing bottlenecks or high computational expense problems, which invokes the demands for flexible mes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23686v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23686v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23686v1-abstract-full" style="display: none;"> Message passing plays a vital role in graph neural networks (GNNs) for effective feature learning. However, the over-reliance on input topology diminishes the efficacy of message passing and restricts the ability of GNNs. Despite efforts to mitigate the reliance, existing study encounters message-passing bottlenecks or high computational expense problems, which invokes the demands for flexible message passing with low complexity. In this paper, we propose a novel dynamic message-passing mechanism for GNNs. It projects graph nodes and learnable pseudo nodes into a common space with measurable spatial relations between them. With nodes moving in the space, their evolving relations facilitate flexible pathway construction for a dynamic message-passing process. Associating pseudo nodes to input graphs with their measured relations, graph nodes can communicate with each other intermediately through pseudo nodes under linear complexity. We further develop a GNN model named $\mathtt{\mathbf{N^2}}$ based on our dynamic message-passing mechanism. $\mathtt{\mathbf{N^2}}$ employs a single recurrent layer to recursively generate the displacements of nodes and construct optimal dynamic pathways. Evaluation on eighteen benchmarks demonstrates the superior performance of $\mathtt{\mathbf{N^2}}$ over popular GNNs. $\mathtt{\mathbf{N^2}}$ successfully scales to large-scale benchmarks and requires significantly fewer parameters for graph classification with the shared recurrent layer. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23686v1-abstract-full').style.display = 'none'; document.getElementById('2410.23686v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22901">arXiv:2410.22901</a> <span> [<a href="https://arxiv.org/pdf/2410.22901">pdf</a>, <a href="https://arxiv.org/format/2410.22901">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> HelloMeme: Integrating Spatial Knitting Attentions to Embed High-Level and Fidelity-Rich Conditions in Diffusion Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shengkai Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiao%2C+N">Nianhong Jiao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+T">Tian Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chaojie Yang</a>, <a href="/search/cs?searchtype=author&query=Xue%2C+C">Chenhui Xue</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+B">Boya Niu</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+J">Jun Gao</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22901v1-abstract-short" style="display: inline;"> We propose an effective method for inserting adapters into text-to-image foundation models, which enables the execution of complex downstream tasks while preserving the generalization ability of the base model. The core idea of this method is to optimize the attention mechanism related to 2D feature maps, which enhances the performance of the adapter. This approach was validated on the task of mem… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22901v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22901v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22901v1-abstract-full" style="display: none;"> We propose an effective method for inserting adapters into text-to-image foundation models, which enables the execution of complex downstream tasks while preserving the generalization ability of the base model. The core idea of this method is to optimize the attention mechanism related to 2D feature maps, which enhances the performance of the adapter. This approach was validated on the task of meme video generation and achieved significant results. We hope this work can provide insights for post-training tasks of large text-to-image models. Additionally, as this method demonstrates good compatibility with SD1.5 derivative models, it holds certain value for the open-source community. Therefore, we will release the related code (\url{https://songkey.github.io/hellomeme}). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22901v1-abstract-full').style.display = 'none'; document.getElementById('2410.22901v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 30 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 7 figures, 2 tables</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 68T07 (Primary) 68T10 <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.4.5; I.5.0 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.22306">arXiv:2410.22306</a> <span> [<a href="https://arxiv.org/pdf/2410.22306">pdf</a>, <a href="https://arxiv.org/format/2410.22306">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Multi-Object 3D Grounding with Dynamic Modules and Language-Informed Spatial Attention </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhang%2C+H">Haomeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chiao-An Yang</a>, <a href="/search/cs?searchtype=author&query=Yeh%2C+R+A">Raymond A. Yeh</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.22306v1-abstract-short" style="display: inline;"> Multi-object 3D Grounding involves locating 3D boxes based on a given query phrase from a point cloud. It is a challenging and significant task with numerous applications in visual understanding, human-computer interaction, and robotics. To tackle this challenge, we introduce D-LISA, a two-stage approach incorporating three innovations. First, a dynamic vision module that enables a variable and le… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22306v1-abstract-full').style.display = 'inline'; document.getElementById('2410.22306v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.22306v1-abstract-full" style="display: none;"> Multi-object 3D Grounding involves locating 3D boxes based on a given query phrase from a point cloud. It is a challenging and significant task with numerous applications in visual understanding, human-computer interaction, and robotics. To tackle this challenge, we introduce D-LISA, a two-stage approach incorporating three innovations. First, a dynamic vision module that enables a variable and learnable number of box proposals. Second, a dynamic camera positioning that extracts features for each proposal. Third, a language-informed spatial attention module that better reasons over the proposals to output the final prediction. Empirically, experiments show that our method outperforms the state-of-the-art methods on multi-object 3D grounding by 12.8% (absolute) and is competitive in single-object 3D grounding. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.22306v1-abstract-full').style.display = 'none'; document.getElementById('2410.22306v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">NeurIPS 2024</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.20638">arXiv:2410.20638</a> <span> [<a href="https://arxiv.org/pdf/2410.20638">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Ant Detective: An Automated Approach for Counting Ants in Densely Populated Images and Gaining Insight into Ant Foraging Behavior </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Das%2C+M">Mautushi Das</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+F+C">Fang-Ling Chloe Liu</a>, <a href="/search/cs?searchtype=author&query=Hartle%2C+C">Charly Hartle</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C+S">Chin-Cheng Scotty Yang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C+P+J">C. P. James Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.20638v1-abstract-short" style="display: inline;"> Ant foraging behavior is essential to understanding ecological dynamics and developing effective pest management strategies, but quantifying this behavior is challenging due to the labor-intensive nature of manual counting, especially in densely populated images. This study presents an automated approach using computer vision to count ants and analyze their foraging behavior. Leveraging the YOLOv8… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20638v1-abstract-full').style.display = 'inline'; document.getElementById('2410.20638v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.20638v1-abstract-full" style="display: none;"> Ant foraging behavior is essential to understanding ecological dynamics and developing effective pest management strategies, but quantifying this behavior is challenging due to the labor-intensive nature of manual counting, especially in densely populated images. This study presents an automated approach using computer vision to count ants and analyze their foraging behavior. Leveraging the YOLOv8 model, the system was calibrated and evaluated on datasets encompassing various imaging scenarios and densities. The study results demonstrate that the system achieves average precision and recall of up to 87.96% and 87,78%, respectively, with only 64 calibration images provided when the both calibration and evaluation images share similar imaging backgrounds. When the background is more complex than the calibration images, the system requires a larger calibration set to generalize effectively, with 1,024 images yielding the precision and recall of up to 83.60% and 78.88, respectively. In more challenging scenarios where more than one thousand ants are present in a single image, the system significantly improves detection accuracy by slicing images into smaller patches, reaching a precision and recall of 77.97% and 71.36%, respectively. The system's ability to generate heatmaps visualizes the spatial distribution of ant activity over time, providing valuable insights into their foraging patterns. This spatial-temporal analysis enables a more comprehensive understanding of ant behavior, which is crucial for ecological studies and improving pest control methods. By automating the counting process and offering detailed behavioral analysis, this study provides an efficient tool for researchers and pest control professionals to develop more effective strategies. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.20638v1-abstract-full').style.display = 'none'; document.getElementById('2410.20638v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 27 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.19214">arXiv:2410.19214</a> <span> [<a href="https://arxiv.org/pdf/2410.19214">pdf</a>, <a href="https://arxiv.org/format/2410.19214">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> A Comprehensive Analysis of Social Tie Strength: Definitions, Prediction Methods, and Future Directions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Cheng%2C+X">Xueqi Cheng</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Catherine Yang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yuying Zhao</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Y">Yu Wang</a>, <a href="/search/cs?searchtype=author&query=Karimi%2C+H">Hamid Karimi</a>, <a href="/search/cs?searchtype=author&query=Derr%2C+T">Tyler Derr</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.19214v2-abstract-short" style="display: inline;"> The rapid growth of online social networks has underscored the importance of understanding the intensity of user relationships, referred to as "tie strength." Over the past few decades, extensive efforts have been made to assess tie strength in networks. However, the lack of ground-truth tie strength labels and the differing perspectives on tie strength among researchers have complicated the devel… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19214v2-abstract-full').style.display = 'inline'; document.getElementById('2410.19214v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.19214v2-abstract-full" style="display: none;"> The rapid growth of online social networks has underscored the importance of understanding the intensity of user relationships, referred to as "tie strength." Over the past few decades, extensive efforts have been made to assess tie strength in networks. However, the lack of ground-truth tie strength labels and the differing perspectives on tie strength among researchers have complicated the development of effective prediction methods for real-world applications. In our study, we first categorize mainstream understandings of tie strength into seven standardized definitions and verify their effectiveness by investigating the class distributions and correlations across these definitions. We also draw key insights into tie resilience from the perspective of tie dissolution that (1) stronger ties are more resilient than weaker ones, and (2) this tie resiliency ratio increases as the network evolves. We then conduct extensive experiments to evaluate existing tie strength prediction methods under these definitions, revealing that (1) neural network methods capable of learning from semantic features hold great potential for high performance, (2) models struggle under definitions that offer limited understandings of tie strength in the network, (3) existing models face imbalance issues that cannot be addressed by traditional quantity imbalance techniques, and (4) different definitions of tie strength allow for the inference of not only the current state but also the future state of a tie. Building on these findings, we propose strategies to improve existing methods and suggest several promising directions for future research. Code and datasets are provided at https://github.com/XueqiC/tie_strength_prediction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.19214v2-abstract-full').style.display = 'none'; document.getElementById('2410.19214v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 29 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18477">arXiv:2410.18477</a> <span> [<a href="https://arxiv.org/pdf/2410.18477">pdf</a>, <a href="https://arxiv.org/format/2410.18477">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Monge-Ampere Regularization for Learning Arbitrary Shapes from Point Clouds </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chuanxiang Yang</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Y">Yuanfeng Zhou</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+G">Guangshun Wei</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Long Ma</a>, <a href="/search/cs?searchtype=author&query=Hou%2C+J">Junhui Hou</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuan Liu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+W">Wenping Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18477v1-abstract-short" style="display: inline;"> As commonly used implicit geometry representations, the signed distance function (SDF) is limited to modeling watertight shapes, while the unsigned distance function (UDF) is capable of representing various surfaces. However, its inherent theoretical shortcoming, i.e., the non-differentiability at the zero level set, would result in sub-optimal reconstruction quality. In this paper, we propose the… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18477v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18477v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18477v1-abstract-full" style="display: none;"> As commonly used implicit geometry representations, the signed distance function (SDF) is limited to modeling watertight shapes, while the unsigned distance function (UDF) is capable of representing various surfaces. However, its inherent theoretical shortcoming, i.e., the non-differentiability at the zero level set, would result in sub-optimal reconstruction quality. In this paper, we propose the scaled-squared distance function (S$^{2}$DF), a novel implicit surface representation for modeling arbitrary surface types. S$^{2}$DF does not distinguish between inside and outside regions while effectively addressing the non-differentiability issue of UDF at the zero level set. We demonstrate that S$^{2}$DF satisfies a second-order partial differential equation of Monge-Ampere-type, allowing us to develop a learning pipeline that leverages a novel Monge-Ampere regularization to directly learn S$^{2}$DF from raw unoriented point clouds without supervision from ground-truth S$^{2}$DF values. Extensive experiments across multiple datasets show that our method significantly outperforms state-of-the-art supervised approaches that require ground-truth surface information as supervision for training. The code will be publicly available at https://github.com/chuanxiang-yang/S2DF. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18477v1-abstract-full').style.display = 'none'; document.getElementById('2410.18477v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 24 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18329">arXiv:2410.18329</a> <span> [<a href="https://arxiv.org/pdf/2410.18329">pdf</a>, <a href="https://arxiv.org/format/2410.18329">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Human-Computer Interaction">cs.HC</span> </div> </div> <p class="title is-5 mathjax"> When Group Spirit Meets Personal Journeys: Exploring Motivational Dynamics and Design Opportunities in Group Therapy </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Geng%2C+S">Shixian Geng</a>, <a href="/search/cs?searchtype=author&query=Shimojima%2C+G">Ginshi Shimojima</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chi-Lan Yang</a>, <a href="/search/cs?searchtype=author&query=Sramek%2C+Z">Zefan Sramek</a>, <a href="/search/cs?searchtype=author&query=Norihama%2C+S">Shunpei Norihama</a>, <a href="/search/cs?searchtype=author&query=Takano%2C+A">Ayumi Takano</a>, <a href="/search/cs?searchtype=author&query=Hosio%2C+S">Simo Hosio</a>, <a href="/search/cs?searchtype=author&query=Yatani%2C+K">Koji Yatani</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18329v1-abstract-short" style="display: inline;"> Psychotherapy, such as cognitive-behavioral therapy (CBT), is effective in treating various mental disorders. Technology-facilitated mental health therapy improves client engagement through methods like digitization or gamification. However, these innovations largely cater to individual therapy, ignoring the potential of group therapy-a treatment for multiple clients concurrently, which enables in… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18329v1-abstract-full').style.display = 'inline'; document.getElementById('2410.18329v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18329v1-abstract-full" style="display: none;"> Psychotherapy, such as cognitive-behavioral therapy (CBT), is effective in treating various mental disorders. Technology-facilitated mental health therapy improves client engagement through methods like digitization or gamification. However, these innovations largely cater to individual therapy, ignoring the potential of group therapy-a treatment for multiple clients concurrently, which enables individual clients to receive various perspectives in the treatment process and also addresses the scarcity of healthcare practitioners to reduce costs. Notwithstanding its cost-effectiveness and unique social dynamics that foster peer learning and community support, group therapy, such as group CBT, faces the issue of attrition. While existing medical work has developed guidelines for therapists, such as establishing leadership and empathy to facilitate group therapy, understanding about the interactions between each stakeholder is still missing. To bridge this gap, this study examined a group CBT program called the Serigaya Methamphetamine Relapse Prevention Program (SMARPP) as a case study to understand stakeholder coordination and communication, along with factors promoting and hindering continuous engagement in group therapy. In-depth interviews with eight facilitators and six former clients from SMARPP revealed the motivators and demotivators for facilitator-facilitator, client-client, and facilitator-client communications. Our investigation uncovers the presence of discernible conflicts between clients' intrapersonal motivation as well as interpersonal motivation in the context of group therapy through the lens of self-determination theory. We discuss insights and research opportunities for the HCI community to mediate such tension and enhance stakeholder communication in future technology-assisted group therapy settings. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18329v1-abstract-full').style.display = 'none'; document.getElementById('2410.18329v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.18032">arXiv:2410.18032</a> <span> [<a href="https://arxiv.org/pdf/2410.18032">pdf</a>, <a href="https://arxiv.org/format/2410.18032">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Multiagent Systems">cs.MA</span> </div> </div> <p class="title is-5 mathjax"> GraphTeam: Facilitating Large Language Model-based Graph Analysis via Multi-Agent Collaboration </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+X">Xin Li</a>, <a href="/search/cs?searchtype=author&query=Chu%2C+Q">Qizhi Chu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yubin Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yang Liu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yaoqi Liu</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+Z">Zekai Yu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Weize Chen</a>, <a href="/search/cs?searchtype=author&query=Qian%2C+C">Chen Qian</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+C">Chuan Shi</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cheng Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.18032v2-abstract-short" style="display: inline;"> Graphs are widely used for modeling relational data in real-world scenarios, such as social networks and urban computing. Existing LLM-based graph analysis approaches either integrate graph neural networks (GNNs) for specific machine learning tasks, limiting their transferability, or rely solely on LLMs' internal reasoning ability, resulting in suboptimal performance. To address these limitations,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18032v2-abstract-full').style.display = 'inline'; document.getElementById('2410.18032v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.18032v2-abstract-full" style="display: none;"> Graphs are widely used for modeling relational data in real-world scenarios, such as social networks and urban computing. Existing LLM-based graph analysis approaches either integrate graph neural networks (GNNs) for specific machine learning tasks, limiting their transferability, or rely solely on LLMs' internal reasoning ability, resulting in suboptimal performance. To address these limitations, we take advantage of recent advances in LLM-based agents, which have shown capabilities of utilizing external knowledge or tools for problem solving. By simulating human problem-solving strategies such as analogy and collaboration, we propose a multi-agent system based on LLMs named GraphTeam, for graph analysis. GraphTeam consists of five LLM-based agents from three modules, and the agents with different specialities can collaborate with each other to address complex problems. Specifically, (1) input-output normalization module: the question agent extracts and refines four key arguments from the original question, facilitating the problem understanding, and the answer agent organizes the results to meet the output requirement; (2) external knowledge retrieval module: we first build a knowledge base consisting of relevant documentation and experience information, and then the search agent retrieves the most relevant entries for each question. (3) problem-solving module: given the retrieved information from search agent, the coding agent uses established algorithms via programming to generate solutions, and in case the coding agent does not work, the reasoning agent will directly compute the results without programming. Extensive experiments on six graph analysis benchmarks demonstrate that GraphTeam achieves state-of-the-art performance with an average 25.85% improvement over the best baseline in terms of accuracy. The code and data are available at https://github.com/BUPT-GAMMA/GraphTeam. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.18032v2-abstract-full').style.display = 'none'; document.getElementById('2410.18032v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.17952">arXiv:2410.17952</a> <span> [<a href="https://arxiv.org/pdf/2410.17952">pdf</a>, <a href="https://arxiv.org/format/2410.17952">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> SimRAG: Self-Improving Retrieval-Augmented Generation for Adapting Large Language Models to Specialized Domains </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Xu%2C+R">Ran Xu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+H">Hui Liu</a>, <a href="/search/cs?searchtype=author&query=Nag%2C+S">Sreyashi Nag</a>, <a href="/search/cs?searchtype=author&query=Dai%2C+Z">Zhenwei Dai</a>, <a href="/search/cs?searchtype=author&query=Xie%2C+Y">Yaochen Xie</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+X">Xianfeng Tang</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+C">Chen Luo</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yang Li</a>, <a href="/search/cs?searchtype=author&query=Ho%2C+J+C">Joyce C. Ho</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Carl Yang</a>, <a href="/search/cs?searchtype=author&query=He%2C+Q">Qi He</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.17952v1-abstract-short" style="display: inline;"> Retrieval-augmented generation (RAG) enhances the question-answering (QA) abilities of large language models (LLMs) by integrating external knowledge. However, adapting general-purpose RAG systems to specialized fields such as science and medicine poses unique challenges due to distribution shifts and limited access to domain-specific data. To tackle this, we propose SimRAG, a self-training approa… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17952v1-abstract-full').style.display = 'inline'; document.getElementById('2410.17952v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.17952v1-abstract-full" style="display: none;"> Retrieval-augmented generation (RAG) enhances the question-answering (QA) abilities of large language models (LLMs) by integrating external knowledge. However, adapting general-purpose RAG systems to specialized fields such as science and medicine poses unique challenges due to distribution shifts and limited access to domain-specific data. To tackle this, we propose SimRAG, a self-training approach that equips the LLM with joint capabilities of question answering and question generation for domain adaptation. Our method first fine-tunes the LLM on instruction-following, question-answering, and search-related data. Then, it prompts the same LLM to generate diverse domain-relevant questions from unlabeled corpora, with an additional filtering strategy to retain high-quality synthetic examples. By leveraging these synthetic examples, the LLM can improve their performance on domain-specific RAG tasks. Experiments on 11 datasets, spanning two backbone sizes and three domains, demonstrate that SimRAG outperforms baselines by 1.2\%--8.6\%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.17952v1-abstract-full').style.display = 'none'; document.getElementById('2410.17952v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 23 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Work in Progress</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.16803">arXiv:2410.16803</a> <span> [<a href="https://arxiv.org/pdf/2410.16803">pdf</a>, <a href="https://arxiv.org/format/2410.16803">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Context-aware Inductive Knowledge Graph Completion with Latent Type Constraints and Subgraph Reasoning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+M">Muzhi Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Cehao Yang</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+C">Chengjin Xu</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zixing Song</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+X">Xuhui Jiang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+J">Jian Guo</a>, <a href="/search/cs?searchtype=author&query=Leung%2C+H">Ho-fung Leung</a>, <a href="/search/cs?searchtype=author&query=King%2C+I">Irwin King</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.16803v2-abstract-short" style="display: inline;"> Inductive knowledge graph completion (KGC) aims to predict missing triples with unseen entities. Recent works focus on modeling reasoning paths between the head and tail entity as direct supporting evidence. However, these methods depend heavily on the existence and quality of reasoning paths, which limits their general applicability in different scenarios. In addition, we observe that latent type… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16803v2-abstract-full').style.display = 'inline'; document.getElementById('2410.16803v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.16803v2-abstract-full" style="display: none;"> Inductive knowledge graph completion (KGC) aims to predict missing triples with unseen entities. Recent works focus on modeling reasoning paths between the head and tail entity as direct supporting evidence. However, these methods depend heavily on the existence and quality of reasoning paths, which limits their general applicability in different scenarios. In addition, we observe that latent type constraints and neighboring facts inherent in KGs are also vital in inferring missing triples. To effectively utilize all useful information in KGs, we introduce CATS, a novel context-aware inductive KGC solution. With sufficient guidance from proper prompts and supervised fine-tuning, CATS activates the strong semantic understanding and reasoning capabilities of large language models to assess the existence of query triples, which consist of two modules. First, the type-aware reasoning module evaluates whether the candidate entity matches the latent entity type as required by the query relation. Then, the subgraph reasoning module selects relevant reasoning paths and neighboring facts, and evaluates their correlation to the query triple. Experiment results on three widely used datasets demonstrate that CATS significantly outperforms state-of-the-art methods in 16 out of 18 transductive, inductive, and few-shot settings with an average absolute MRR improvement of 7.2%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.16803v2-abstract-full').style.display = 'none'; document.getElementById('2410.16803v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 22 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Yang%2C+C&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Yang%2C+C&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+C&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+C&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+C&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Yang%2C+C&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet">  <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a>  </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary">  <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div>   <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div>  </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>

CINXE.COM

Search | arXiv e-print repository