Search | arXiv e-print repository
Showing 1–50 of 2,546 results for author: Chen, L

Searching in archive cs. Results are sorted by announcement date (newest first), 50 results per page. (arXiv Search v0.5.6, released 2020-02-24)
1. arXiv:2411.17595 [pdf, ps, other]
Subjects: cs.LG (Machine Learning); stat.AP (Applications)

Can artificial intelligence predict clinical trial outcomes?

Authors: Shuyi Jin, Lu Chen, Hongru Ding, Meijie Wang, Lun Yu

Abstract: The increasing complexity and cost of clinical trials, particularly in the context of oncology and advanced therapies, pose significant challenges for drug development. This study evaluates the predictive capabilities of large language models (LLMs) such as GPT-3.5, GPT-4, and HINT in determining clinical trial outcomes. By leveraging a curated dataset of trials from ClinicalTrials.gov, we compare the models' performance using metrics including balanced accuracy, specificity, recall, and Matthews Correlation Coefficient (MCC). Results indicate that GPT-4o demonstrates robust performance in early trial phases, achieving high recall but facing limitations in specificity. Conversely, the HINT model excels in recognizing negative outcomes, particularly in later trial phases, offering a balanced approach across diverse endpoints. Oncology trials, characterized by high complexity, remain challenging for all models. Additionally, trial duration and disease categories influence predictive performance, with longer durations and complex diseases such as neoplasms reducing accuracy. This study highlights the complementary strengths of LLMs and HINT, providing insights into optimizing predictive tools for clinical trial design and risk management. Future advancements in LLMs are essential to address current gaps in handling negative outcomes and complex domains.

Submitted 26 November, 2024; originally announced November 2024.
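The metrics named in this abstract (balanced accuracy, specificity, recall, MCC) can all be read off a binary confusion matrix. The sketch below is purely illustrative and not taken from the paper; it shows one way to compute them with scikit-learn on invented toy labels.

```python
import numpy as np
from sklearn.metrics import (balanced_accuracy_score, confusion_matrix,
                             matthews_corrcoef, recall_score)

# Toy data: 1 = trial met its primary endpoint, 0 = it did not (invented).
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 1, 0, 0, 1, 1, 0])

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

metrics = {
    "balanced accuracy": balanced_accuracy_score(y_true, y_pred),
    "recall (sensitivity)": recall_score(y_true, y_pred),  # TP / (TP + FN)
    "specificity": tn / (tn + fp),                         # TN / (TN + FP)
    "MCC": matthews_corrcoef(y_true, y_pred),
}
for name, value in metrics.items():
    print(f"{name}: {value:.3f}")
```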
2. arXiv:2411.17521 [pdf, other]
Subjects: cs.RO (Robotics)

BESTAnP: Bi-Step Efficient and Statistically Optimal Estimator for Acoustic-n-Point Problem

Authors: Wenliang Sheng, Hongxu Zhao, Lingpeng Chen, Guangyang Zeng, Yunling Shao, Yuze Hong, Chao Yang, Ziyang Hong, Junfeng Wu

Abstract: We consider the acoustic-n-point (AnP) problem, which estimates the pose of a 2D forward-looking sonar (FLS) according to n 3D-2D point correspondences. We explore the nature of the measured partial spherical coordinates and reveal their inherent relationships to translation and orientation. Based on this, we propose a bi-step efficient and statistically optimal AnP (BESTAnP) algorithm that decouples the estimation of translation and orientation. Specifically, in the first step, the translation estimation is formulated as a range-based localization problem based on distance-only measurements. In the second step, the rotation is estimated via eigendecomposition based on azimuth-only measurements and the estimated translation. BESTAnP is the first AnP algorithm that gives a closed-form solution for the full six-degree-of-freedom pose. In addition, we conduct bias elimination for BESTAnP such that it has the statistical property of consistency. Through simulation and real-world experiments, we demonstrate that compared with state-of-the-art (SOTA) methods, BESTAnP is over ten times faster and offers real-time capability on resource-constrained platforms while exhibiting comparable accuracy. Moreover, for the first time, we embed BESTAnP into a sonar-based odometry pipeline, which shows its effectiveness for trajectory estimation.

Submitted 26 November, 2024; originally announced November 2024.
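The first step described in this abstract, recovering translation from distance-only measurements, is a classical range-based localization problem. Below is a minimal linear least-squares sketch of that generic step, not the authors' estimator and without their bias elimination; the anchor points and ranges are synthetic.

```python
import numpy as np

def range_localization(anchors: np.ndarray, dists: np.ndarray) -> np.ndarray:
    """Estimate a 3D position from known anchor points and range measurements.

    Classic linearization: subtracting the first range equation from the
    others removes the quadratic ||x||^2 term and leaves a linear system.
    """
    a0, d0 = anchors[0], dists[0]
    A = 2.0 * (anchors[1:] - a0)
    b = (d0**2 - dists[1:]**2
         + np.sum(anchors[1:]**2, axis=1) - np.sum(a0**2))
    x, *_ = np.linalg.lstsq(A, b, rcond=None)
    return x

# Synthetic check: four anchors, ranges measured from a known ground-truth point.
anchors = np.array([[0., 0., 0.], [4., 0., 0.], [0., 5., 0.], [0., 0., 3.]])
truth = np.array([1.0, 2.0, 0.5])
dists = np.linalg.norm(anchors - truth, axis=1)
print(range_localization(anchors, dists))  # approximately [1.0, 2.0, 0.5]
```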
3. arXiv:2411.17459 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.AI (Artificial Intelligence)

WF-VAE: Enhancing Video VAE by Wavelet-Driven Energy Flow for Latent Video Diffusion Model

Authors: Zongjian Li, Bin Lin, Yang Ye, Liuhan Chen, Xinhua Cheng, Shenghai Yuan, Li Yuan

Abstract: Video Variational Autoencoder (VAE) encodes videos into a low-dimensional latent space, becoming a key component of most Latent Video Diffusion Models (LVDMs) to reduce model training costs. However, as the resolution and duration of generated videos increase, the encoding cost of Video VAEs becomes a limiting bottleneck in training LVDMs. Moreover, the block-wise inference method adopted by most LVDMs can lead to discontinuities of latent space when processing long-duration videos. The key to addressing the computational bottleneck lies in decomposing videos into distinct components and efficiently encoding the critical information. Since wavelet transforms can decompose videos into multiple frequency-domain components and significantly improve efficiency, we propose Wavelet Flow VAE (WF-VAE), an autoencoder that leverages multi-level wavelet transform to facilitate low-frequency energy flow into the latent representation. Furthermore, we introduce a method called Causal Cache, which maintains the integrity of latent space during block-wise inference. Compared to state-of-the-art video VAEs, WF-VAE demonstrates superior performance in both PSNR and LPIPS metrics, achieving 2x higher throughput and 4x lower memory consumption while maintaining competitive reconstruction quality. Our code and models are available at https://github.com/PKU-YuanGroup/WF-VAE.

Submitted 26 November, 2024; originally announced November 2024.

Comments: 8 pages, 7 figures
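As a purely illustrative aside (not code from the WF-VAE repository), a single-level 2D Haar transform shows how one frame splits into a low-frequency subband, which carries most of the energy for smooth content, plus three high-frequency detail subbands; the paper's multi-level, video-level transform generalizes this idea.

```python
import numpy as np

def haar2d(frame: np.ndarray):
    """Single-level 2D Haar decomposition of one frame (H and W must be even).

    Returns the low-frequency approximation LL and the detail subbands
    (LH, HL, HH), each of size (H/2, W/2)."""
    lo_r = (frame[:, 0::2] + frame[:, 1::2]) / 2.0  # pairwise averages along rows
    hi_r = (frame[:, 0::2] - frame[:, 1::2]) / 2.0  # pairwise differences along rows
    ll = (lo_r[0::2, :] + lo_r[1::2, :]) / 2.0
    lh = (lo_r[0::2, :] - lo_r[1::2, :]) / 2.0
    hl = (hi_r[0::2, :] + hi_r[1::2, :]) / 2.0
    hh = (hi_r[0::2, :] - hi_r[1::2, :]) / 2.0
    return ll, (lh, hl, hh)

# Stand-in for one mostly smooth video frame.
yy, xx = np.mgrid[0:64, 0:64]
frame = np.sin(xx / 10.0) + 0.05 * np.random.rand(64, 64)

ll, details = haar2d(frame)
energy = lambda a: float(np.sum(a**2))
print("LL energy fraction:",
      energy(ll) / (energy(ll) + sum(energy(d) for d in details)))
```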
4. arXiv:2411.17440 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.MM (Multimedia)

Identity-Preserving Text-to-Video Generation by Frequency Decomposition

Authors: Shenghai Yuan, Jinfa Huang, Xianyi He, Yunyuan Ge, Yujun Shi, Liuhan Chen, Jiebo Luo, Li Yuan

Abstract: Identity-preserving text-to-video (IPT2V) generation aims to create high-fidelity videos with consistent human identity. It is an important task in video generation but remains an open problem for generative models. This paper pushes the technical frontier of IPT2V in two directions that have not been resolved in the literature: (1) a tuning-free pipeline without tedious case-by-case finetuning, and (2) a frequency-aware heuristic identity-preserving DiT-based control scheme. We propose ConsisID, a tuning-free DiT-based controllable IPT2V model that keeps human identity consistent in the generated video. Inspired by prior findings in frequency analysis of diffusion transformers, it employs identity-control signals in the frequency domain, where facial features can be decomposed into low-frequency global features and high-frequency intrinsic features. First, from a low-frequency perspective, we introduce a global facial extractor, which encodes reference images and facial key points into a latent space, generating features enriched with low-frequency information. These features are then integrated into shallow layers of the network to alleviate training challenges associated with DiT. Second, from a high-frequency perspective, we design a local facial extractor to capture high-frequency details and inject them into transformer blocks, enhancing the model's ability to preserve fine-grained features. We propose a hierarchical training strategy to leverage frequency information for identity preservation, transforming a vanilla pre-trained video generation model into an IPT2V model. Extensive experiments demonstrate that our frequency-aware heuristic scheme provides an optimal control solution for DiT-based models. Thanks to this scheme, our ConsisID generates high-quality, identity-preserving videos, making strides towards more effective IPT2V.

Submitted 26 November, 2024; originally announced November 2024.

Comments: 12 pages, 8 figures
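To make the low/high-frequency decomposition mentioned in this abstract concrete, here is a generic sketch rather than ConsisID code: a Gaussian blur stands in for the low-frequency "global" component of a face crop and the residual for the high-frequency "detail" component. The function name, sigma value, and input are all illustrative assumptions.

```python
import numpy as np
from scipy.ndimage import gaussian_filter

def split_frequencies(face: np.ndarray, sigma: float = 3.0):
    """Split an image into a low-frequency part (coarse structure) and a
    high-frequency residual (fine detail). Purely illustrative."""
    low = gaussian_filter(face, sigma=sigma)
    high = face - low
    return low, high

face = np.random.rand(112, 112).astype(np.float32)  # stand-in for an aligned face crop
low, high = split_frequencies(face)
assert np.allclose(low + high, face, atol=1e-5)     # the two parts recompose the input
```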
5. arXiv:2411.17230 [pdf, other]
Subjects: cs.SE (Software Engineering)

Fault Localization from the Semantic Code Search Perspective

Authors: Yihao Qin, Shangwen Wang, Yan Lei, Zhuo Zhang, Bo Lin, Xin Peng, Liqian Chen, Xiaoguang Mao

Abstract: The software development process is characterized by an iterative cycle of continuous functionality implementation and debugging, essential for the enhancement of software quality and adaptability to changing requirements. This process incorporates two separately studied tasks: Code Search (CS), which retrieves reference code from a code corpus to aid in code implementation, and Fault Localization (FL), which identifies code entities responsible for bugs within the software project to boost software debugging. These two tasks exhibit similarities since they both address search problems. Notably, CS techniques have demonstrated greater effectiveness than FL ones, possibly because of the precise semantic details of the required code offered by natural language queries, which are not readily accessible to FL methods. Drawing inspiration from this, we hypothesize that a fault localizer could achieve greater proficiency if semantic information about the buggy methods were made available. Based on this idea, we propose CosFL, an FL approach that decomposes the FL task into two steps: query generation, which describes the functionality of the problematic code in natural language, and fault retrieval, which uses CS to find program elements semantically related to the query. Specifically, to depict the buggy functionalities and generate high-quality queries, CosFL extensively harnesses the code analysis, semantic comprehension, and decision-making capabilities of LLMs. Moreover, to enhance the accuracy of CS, CosFL captures varying levels of context information and employs a multi-granularity code search strategy, which facilitates a more precise identification of buggy methods from a holistic view. The evaluation on 835 real bugs from 23 Java projects shows that CosFL successfully localizes 324 bugs within Top-1, which significantly outperforms the state-of-the-art approaches by 26.6%-57.3%.

Submitted 26 November, 2024; originally announced November 2024.
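The fault-retrieval step described here is, at its core, ranking program elements against a natural-language query. The toy sketch below is not CosFL itself; the method names and corpus are invented, and TF-IDF cosine similarity stands in for the LLM-based, multi-granularity search the abstract describes.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Invented mini-corpus: natural-language summaries of candidate methods.
methods = {
    "AccountService.withdraw": "debit an account and reject overdrafts",
    "AccountService.deposit": "credit an account balance",
    "ReportWriter.format": "render a monthly statement as text",
}
query = "balance goes negative after withdrawal, overdraft check not applied"

vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(list(methods.values()) + [query])
scores = cosine_similarity(matrix[-1], matrix[:-1]).ravel()

# Rank candidate methods by similarity to the query (most suspicious first).
for name, score in sorted(zip(methods, scores), key=lambda pair: -pair[1]):
    print(f"{score:.2f}  {name}")
```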
6. arXiv:2411.16964 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition); cs.GR (Graphics); cs.RO (Robotics)

MotionWavelet: Human Motion Prediction via Wavelet Manifold Learning

Authors: Yuming Feng, Zhiyang Dou, Ling-Hao Chen, Yuan Liu, Tianyu Li, Jingbo Wang, Zeyu Cao, Wenping Wang, Taku Komura, Lingjie Liu

Abstract: Modeling temporal characteristics and the non-stationary dynamics of body movement plays a significant role in predicting future human motions. However, it is challenging to capture these features due to the subtle transitions involved in complex human motions. This paper introduces MotionWavelet, a human motion prediction framework that utilizes Wavelet Transformation and studies human motion patterns in the spatial-frequency domain. In MotionWavelet, a Wavelet Diffusion Model (WDM) learns a Wavelet Manifold by applying Wavelet Transformation on the motion data, thereby encoding the intricate spatial and temporal motion patterns. Once the Wavelet Manifold is built, WDM trains a diffusion model to generate human motions from Wavelet latent vectors. In addition to the WDM, MotionWavelet also presents a Wavelet Space Shaping Guidance mechanism to refine the denoising process to improve conformity with the manifold structure. WDM also develops Temporal Attention-Based Guidance to enhance prediction accuracy. Extensive experiments validate the effectiveness of MotionWavelet, demonstrating improved prediction accuracy and enhanced generalization across various benchmarks. Our code and models will be released upon acceptance.

Submitted 25 November, 2024; originally announced November 2024.

Comments: Project Page: https://frank-zy-dou.github.io/projects/MotionWavelet/ Video: https://youtu.be/pyWq0OYJdI0?si=4YHfFNXmLnbPC39g
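For intuition about the wavelet-manifold idea, the sketch below (illustrative only, not the paper's implementation; it assumes the PyWavelets package) applies a multi-level 1D wavelet transform to a synthetic joint-angle trajectory, producing the kind of frequency-band coefficients on which a diffusion model could be trained.

```python
import numpy as np
import pywt  # PyWavelets

# Synthetic joint-angle trajectory: slow motion plus a brief fast transition.
t = np.linspace(0, 2 * np.pi, 128)
trajectory = np.sin(t) + 0.3 * np.sin(8 * t) * (t > np.pi)

# Multi-level discrete wavelet transform: coarse approximation + detail bands.
coeffs = pywt.wavedec(trajectory, "db2", level=3)
for i, c in enumerate(coeffs):
    band = "approx" if i == 0 else f"detail-{i}"
    print(f"{band}: {len(c)} coefficients, energy {np.sum(c**2):.2f}")

# The transform is invertible, so the coefficients fully describe the motion.
reconstructed = pywt.waverec(coeffs, "db2")[: len(trajectory)]
print("max reconstruction error:", float(np.max(np.abs(reconstructed - trajectory))))
```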
7. arXiv:2411.16375 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)

Ca2-VDM: Efficient Autoregressive Video Diffusion Model with Causal Generation and Cache Sharing

Authors: Kaifeng Gao, Jiaxin Shi, Hanwang Zhang, Chunping Wang, Jun Xiao, Long Chen

Abstract: With the advance of diffusion models, today's video generation has achieved impressive quality. To extend the generation length and facilitate real-world applications, a majority of video diffusion models (VDMs) generate videos in an autoregressive manner, i.e., generating subsequent clips conditioned on the last frame(s) of the previous clip. However, existing autoregressive VDMs are highly inefficient and redundant: the model must re-compute all the conditional frames that are overlapped between adjacent clips. This issue is exacerbated when the conditional frames are extended autoregressively to provide the model with long-term context. In such cases, the computational demands increase significantly (i.e., with quadratic complexity w.r.t. the autoregression step). In this paper, we propose Ca2-VDM, an efficient autoregressive VDM with Causal generation and Cache sharing. For causal generation, it introduces unidirectional feature computation, which ensures that the cache of conditional frames can be precomputed in previous autoregression steps and reused in every subsequent step, eliminating redundant computations. For cache sharing, it shares the cache across all denoising steps to avoid the huge cache storage cost. Extensive experiments demonstrate that our Ca2-VDM achieves state-of-the-art quantitative and qualitative video generation results and significantly improves the generation speed. Code is available at https://github.com/Dawn-LX/CausalCache-VDM.

Submitted 25 November, 2024; originally announced November 2024.

Comments: Technical Report. Code is available at https://github.com/Dawn-LX/CausalCache-VDM
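The caching idea this abstract describes, computing conditional-frame features once and reusing them in every later autoregression step, can be illustrated with a small memoization sketch. It is conceptual only; the feature extractor and frame representation are stand-ins, not the paper's model.

```python
import numpy as np

class FrameFeatureCache:
    """Memoize per-frame conditional features so each frame is encoded once,
    then reused by every later autoregression step (conceptual sketch)."""

    def __init__(self, encode):
        self.encode = encode  # expensive per-frame feature extractor
        self.store = {}

    def features(self, frames, indices):
        out = []
        for i in indices:
            if i not in self.store:          # computed at most once per frame
                self.store[i] = self.encode(frames[i])
            out.append(self.store[i])
        return out

# Stand-in "encoder": in a real VDM this would be an expensive network pass.
encode = lambda frame: frame.mean(axis=(0, 1))
frames = {i: np.random.rand(8, 8, 3) for i in range(6)}
cache = FrameFeatureCache(encode)

cache.features(frames, [0, 1, 2])     # step 1: encodes frames 0-2
cache.features(frames, [0, 1, 2, 3])  # step 2: only frame 3 is newly encoded
print(sorted(cache.store))            # [0, 1, 2, 3]
```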
8. arXiv:2411.16085 [pdf, other]
Subjects: cs.LG (Machine Learning); cs.AI (Artificial Intelligence); cs.CL (Computation and Language); cs.CV (Computer Vision and Pattern Recognition); cs.DM (Discrete Mathematics)

Cautious Optimizers: Improving Training with One Line of Code

Authors: Kaizhao Liang, Lizhang Chen, Bo Liu, Qiang Liu

Abstract: AdamW has been the default optimizer for transformer pretraining. For many years, our community has searched for faster and more stable optimizers, with only constrained positive outcomes. In this work, we propose a single-line modification in PyTorch to any momentum-based optimizer, which we rename Cautious Optimizer, e.g. C-AdamW and C-Lion. Our theoretical result shows that this modification preserves Adam's Hamiltonian function and does not break the convergence guarantee under Lyapunov analysis. In addition, a whole new family of optimizers is revealed by our theoretical insight. Among them, we pick the simplest one for empirical experiments, showing speed-ups on Llama and MAE pretraining of up to $1.47\times$. Code is available at https://github.com/kyleliang919/C-Optim

Submitted 24 November, 2024; originally announced November 2024.
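The abstract does not spell out the one-line change, so the sketch below is a hedged reading rather than the authors' exact code: a "cautious" mask that zeroes the components of a momentum-based update whose sign disagrees with the current gradient, applied here to plain SGD with momentum in PyTorch.

```python
import torch

def cautious_momentum_step(param, grad, momentum, lr=0.01, beta=0.9):
    """One SGD-with-momentum step with a 'cautious' mask: components of the
    momentum update whose sign disagrees with the current gradient are zeroed.
    A hedged illustration of the idea, not the authors' C-AdamW code."""
    momentum.mul_(beta).add_(grad)                # standard momentum update
    mask = (momentum * grad > 0).to(grad.dtype)   # keep only sign-consistent parts
    param.add_(momentum * mask, alpha=-lr)
    return param, momentum

param = torch.zeros(4)
momentum = torch.zeros(4)
grad = torch.tensor([0.5, -0.2, 0.1, -0.4])
param, momentum = cautious_momentum_step(param, grad, momentum)
print(param)  # parameters move only where momentum and gradient agree in sign
```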
9. arXiv:2411.16077 [pdf, other]
Subjects: cs.CL (Computation and Language); cs.MA (Multiagent Systems)

SAGEval: The frontiers of Satisfactory Agent based NLG Evaluation for reference-free open-ended text

Authors: Reshmi Ghosh, Tianyi Yao, Lizzy Chen, Sadid Hasan, Tianwei Chen, Dario Bernal, Huitian Jiao, H M Sajjad Hossain

Abstract: Large Language Model (LLM) integrations into applications like the Microsoft 365 suite and Google Workspace for creating and processing documents, emails, presentations, etc. have led to considerable enhancements in productivity and time savings. But as these integrations become more complex, it is paramount to ensure that the quality of output from the LLM-integrated applications is relevant and appropriate for use. Identifying the need to develop robust evaluation approaches for natural language generation, wherein references/ground-truth labels don't exist or aren't amply available, this paper introduces a novel framework called "SAGEval", which utilizes a critiquing Agent to provide feedback on scores generated by LLM evaluators. We show that the critiquing Agent is able to rectify scores from LLM evaluators in the absence of references/ground-truth labels, thereby reducing the need for labeled data even for complex NLG evaluation scenarios, like the generation of JSON-structured forms/surveys with responses in different styles such as multiple choice, Likert ratings, single-choice questions, etc.

Submitted 24 November, 2024; originally announced November 2024.
10. arXiv:2411.16007 [pdf, other]
Subjects: cs.AR (Hardware Architecture); cs.AI (Artificial Intelligence); cs.DC (Distributed, Parallel, and Cluster Computing); cs.PF (Performance)

Performance Implications of Multi-Chiplet Neural Processing Units on Autonomous Driving Perception

Authors: Mohanad Odema, Luke Chen, Hyoukjun Kwon, Mohammad Abdullah Al Faruque

Abstract: We study the application of emerging chiplet-based Neural Processing Units to accelerate vehicular AI perception workloads in constrained automotive settings. The motivation stems from how chiplet technology is becoming integral to emerging vehicular architectures, providing a cost-effective trade-off between performance, modularity, and customization, and from perception models being the most computationally demanding workloads in an autonomous driving system. Using the Tesla Autopilot perception pipeline as a case study, we first break down its constituent models and profile their performance on different chiplet accelerators. From these insights, we propose a novel scheduling strategy to efficiently deploy perception workloads on multi-chip AI accelerators. Our experiments using a standard DNN performance simulator, MAESTRO, show that our approach realizes an 82% increase in throughput and a 2.8x increase in processing-engine utilization compared to monolithic accelerator designs.

Submitted 24 November, 2024; originally announced November 2024.

Comments: DATE'2025
11. arXiv:2411.15519 [pdf, other]
Subjects: q-fin.RM (Risk Management); cs.LG (Machine Learning); stat.ML (Machine Learning)

Risk Management with Feature-Enriched Generative Adversarial Networks (FE-GAN)

Authors: Ling Chen

Abstract: This paper investigates the application of Feature-Enriched Generative Adversarial Networks (FE-GAN) in financial risk management, with a focus on improving the estimation of Value at Risk (VaR) and Expected Shortfall (ES). FE-GAN enhances existing GAN architectures by incorporating an additional input sequence derived from preceding data to improve model performance. Two specialized GAN models, the Wasserstein Generative Adversarial Network (WGAN) and the Tail Generative Adversarial Network (Tail-GAN), were evaluated under the FE-GAN framework. The results demonstrate that FE-GAN significantly outperforms traditional architectures in both VaR and ES estimation. Tail-GAN, leveraging its task-specific loss function, consistently outperforms WGAN in ES estimation, while both models exhibit similar performance in VaR estimation. Despite these promising results, the study acknowledges limitations, including reliance on highly correlated temporal data and restricted applicability to other domains. Future research directions include exploring alternative input generation methods, dynamic forecasting models, and advanced neural network architectures to further enhance GAN-based financial risk estimation.

Submitted 23 November, 2024; originally announced November 2024.
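Once a generative model can produce return or loss scenarios, the two risk measures named in this abstract are simple functionals of the sample distribution. The sketch below is generic finance arithmetic, not FE-GAN code; the Student-t samples merely stand in for generator output.

```python
import numpy as np

def var_es(losses: np.ndarray, alpha: float = 0.95):
    """Empirical Value at Risk and Expected Shortfall at confidence level alpha.
    Losses are positive numbers; VaR is the alpha-quantile of the loss
    distribution and ES is the mean loss beyond that quantile."""
    var = np.quantile(losses, alpha)
    es = losses[losses >= var].mean()
    return var, es

rng = np.random.default_rng(0)
# Stand-in for generator output: 10,000 simulated one-day portfolio losses.
losses = rng.standard_t(df=4, size=10_000) * 0.01
var, es = var_es(losses, alpha=0.95)
print(f"95% VaR: {var:.4f}   95% ES: {es:.4f}")
```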
To address these, we propose a novel optimization-based method, named IterIS: 1) We formulate LoRA merging as an advanced optimization problem to mitigate the rough assumption. Additionally, we employ an iterative inference-solving framework in our algorithm. It can progressively refine the optimization objective for improved performance. 2) We introduce an efficient regularization term to reduce the need for massive sample requirements (requiring only 1-5% of the unlabeled samples compared to prior methods). 3) We utilize adaptive weights in the optimization objective to mitigate potential unbalances in LoRA merging process. Our method demonstrates significant improvements over multiple baselines and state-of-the-art methods in composing tasks for text-to-image diffusion, vision-language models, and large language models. Furthermore, our layer-wise algorithm can achieve convergence with minimal steps, ensuring efficiency in both memory and computation. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15231v1-abstract-full').style.display = 'none'; document.getElementById('2411.15231v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.15096">arXiv:2411.15096</a> <span> [<a href="https://arxiv.org/pdf/2411.15096">pdf</a>, <a href="https://arxiv.org/format/2411.15096">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> RED: Effective Trajectory Representation Learning with Comprehensive Information </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Silin Zhou</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+S">Shuo Shang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lisi Chen</a>, <a href="/search/cs?searchtype=author&query=Jensen%2C+C+S">Christian S. Jensen</a>, <a href="/search/cs?searchtype=author&query=Kalnis%2C+P">Panos Kalnis</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.15096v1-abstract-short" style="display: inline;"> Trajectory representation learning (TRL) maps trajectories to vectors that can then be used for various downstream tasks, including trajectory similarity computation, trajectory classification, and travel-time estimation. However, existing TRL methods often produce vectors that, when used in downstream tasks, yield insufficiently accurate results. 
A key reason is that they fail to utilize the comp… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15096v1-abstract-full').style.display = 'inline'; document.getElementById('2411.15096v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.15096v1-abstract-full" style="display: none;"> Trajectory representation learning (TRL) maps trajectories to vectors that can then be used for various downstream tasks, including trajectory similarity computation, trajectory classification, and travel-time estimation. However, existing TRL methods often produce vectors that, when used in downstream tasks, yield insufficiently accurate results. A key reason is that they fail to utilize the comprehensive information encompassed by trajectories. We propose a self-supervised TRL framework, called RED, which effectively exploits multiple types of trajectory information. Overall, RED adopts the Transformer as the backbone model and masks the constituting paths in trajectories to train a masked autoencoder (MAE). In particular, RED considers the moving patterns of trajectories by employing a Road-aware masking strategy that retains key paths of trajectories during masking, thereby preserving crucial information of the trajectories. RED also adopts a spatial-temporal-user joint Embedding scheme to encode comprehensive information when preparing the trajectories as model inputs. To conduct training, RED adopts Dual-objective task learning: the Transformer encoder predicts the next segment in a trajectory, and the Transformer decoder reconstructs the entire trajectory. RED also considers the spatial-temporal correlations of trajectories by modifying the attention mechanism of the Transformer. We compare RED with 9 state-of-the-art TRL methods for 4 downstream tasks on 3 real-world datasets, finding that RED can usually improve the accuracy of the best-performing baseline by over 5%. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.15096v1-abstract-full').style.display = 'none'; document.getElementById('2411.15096v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by VLDB2025</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14842">arXiv:2411.14842</a> <span> [<a href="https://arxiv.org/pdf/2411.14842">pdf</a>, <a href="https://arxiv.org/format/2411.14842">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Sound">cs.SD</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Audio and Speech Processing">eess.AS</span> </div> </div> <p class="title is-5 mathjax"> Who Can Withstand Chat-Audio Attacks? 
An Evaluation Benchmark for Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wanqi Yang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanda Li</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+M">Meng Fang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yunchao Wei</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+T">Tianyi Zhou</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Ling Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14842v1-abstract-short" style="display: inline;"> Adversarial audio attacks pose a significant threat to the growing use of large language models (LLMs) in voice-based human-machine interactions. While existing research has primarily focused on model-specific adversarial methods, real-world applications demand a more generalizable and universal approach to audio adversarial attacks. In this paper, we introduce the Chat-Audio Attacks (CAA) benchma… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14842v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14842v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14842v1-abstract-full" style="display: none;"> Adversarial audio attacks pose a significant threat to the growing use of large language models (LLMs) in voice-based human-machine interactions. While existing research has primarily focused on model-specific adversarial methods, real-world applications demand a more generalizable and universal approach to audio adversarial attacks. In this paper, we introduce the Chat-Audio Attacks (CAA) benchmark, which includes four distinct types of audio attacks and aims to explore the vulnerabilities of LLMs to these audio attacks in conversational scenarios. To evaluate the robustness of LLMs, we propose three evaluation strategies: Standard Evaluation, utilizing traditional metrics to quantify model performance under attacks; GPT-4o-Based Evaluation, which simulates real-world conversational complexities; and Human Evaluation, offering insights into user perception and trust. We evaluate six state-of-the-art LLMs with voice interaction capabilities, including Gemini-1.5-Pro, GPT-4o, and others, using three distinct evaluation methods on the CAA benchmark. Our comprehensive analysis reveals the impact of four types of audio attacks on the performance of these models, demonstrating that GPT-4o exhibits the highest level of resilience. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14842v1-abstract-full').style.display = 'none'; document.getElementById('2411.14842v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
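<p class="is-size-7">To make the benchmark's setup concrete, the sketch below shows one generic form of audio adversarial attack of the kind such evaluations cover: a bounded noise perturbation applied to a waveform, with the resulting drop in answer quality measured by caller-supplied helpers. The helpers model_answer() and score() are hypothetical placeholders, not part of the CAA benchmark.</p> <pre><code>
# Minimal sketch of a bounded audio perturbation and a robustness check.
# model_answer() and score() are hypothetical stand-ins for a voice-capable
# LLM and an answer-quality metric; they are not the CAA benchmark's code.
import numpy as np

def perturb_waveform(wave, epsilon=0.005, seed=0):
    """Return `wave` plus uniform noise bounded by epsilon (L-infinity)."""
    rng = np.random.default_rng(seed)
    noise = rng.uniform(-epsilon, epsilon, size=wave.shape)
    return np.clip(wave + noise, -1.0, 1.0)

def robustness_drop(wave, reference, model_answer, score):
    """Drop in answer quality between clean and perturbed audio."""
    clean = score(model_answer(wave), reference)
    attacked = score(model_answer(perturb_waveform(wave)), reference)
    return clean - attacked
</code></pre>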
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14768">arXiv:2411.14768</a> <span> [<a href="https://arxiv.org/pdf/2411.14768">pdf</a>, <a href="https://arxiv.org/format/2411.14768">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Grid and Road Expressions Are Complementary for Trajectory Representation Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhou%2C+S">Silin Zhou</a>, <a href="/search/cs?searchtype=author&query=Shang%2C+S">Shuo Shang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lisi Chen</a>, <a href="/search/cs?searchtype=author&query=Han%2C+P">Peng Han</a>, <a href="/search/cs?searchtype=author&query=Jensen%2C+C+S">Christian S. Jensen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14768v1-abstract-short" style="display: inline;"> Trajectory representation learning (TRL) maps trajectories to vectors that can be used for many downstream tasks. Existing TRL methods use either grid trajectories, capturing movement in free space, or road trajectories, capturing movement in a road network, as input. We observe that the two types of trajectories are complementary, providing either region and location information or providing road… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14768v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14768v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14768v1-abstract-full" style="display: none;"> Trajectory representation learning (TRL) maps trajectories to vectors that can be used for many downstream tasks. Existing TRL methods use either grid trajectories, capturing movement in free space, or road trajectories, capturing movement in a road network, as input. We observe that the two types of trajectories are complementary, providing either region and location information or providing road structure and movement regularity. Therefore, we propose a novel multimodal TRL method, dubbed GREEN, to jointly utilize Grid and Road trajectory Expressions for Effective representatioN learning. In particular, we transform raw GPS trajectories into both grid and road trajectories and tailor two encoders to capture their respective information. To align the two encoders such that they complement each other, we adopt a contrastive loss to encourage them to produce similar embeddings for the same raw trajectory and design a mask language model (MLM) loss to use grid trajectories to help reconstruct masked road trajectories. To learn the final trajectory representation, a dual-modal interactor is used to fuse the outputs of the two encoders via cross-attention. We compare GREEN with 7 state-of-the-art TRL methods for 3 downstream tasks, finding that GREEN consistently outperforms all baselines and improves the accuracy of the best-performing baseline by an average of 15.99\%. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14768v1-abstract-full').style.display = 'none'; document.getElementById('2411.14768v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 22 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">This paper is accepted by KDD2025(August Cycle)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14456">arXiv:2411.14456</a> <span> [<a href="https://arxiv.org/pdf/2411.14456">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> </div> </div> <p class="title is-5 mathjax"> Can Artificial Intelligence Generate Quality Research Topics Reflecting Patient Concerns? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Kim%2C+J">Jiyeong Kim</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M+L">Michael L. Chen</a>, <a href="/search/cs?searchtype=author&query=Rezaei%2C+S+J">Shawheen J. Rezaei</a>, <a href="/search/cs?searchtype=author&query=Ramirez-Posada%2C+M">Mariana Ramirez-Posada</a>, <a href="/search/cs?searchtype=author&query=Caswell-Jin%2C+J+L">Jennifer L. Caswell-Jin</a>, <a href="/search/cs?searchtype=author&query=Kurian%2C+A+W">Allison W. Kurian</a>, <a href="/search/cs?searchtype=author&query=Riaz%2C+F">Fauzia Riaz</a>, <a href="/search/cs?searchtype=author&query=Sarin%2C+K+Y">Kavita Y. Sarin</a>, <a href="/search/cs?searchtype=author&query=Tang%2C+J+Y">Jean Y. Tang</a>, <a href="/search/cs?searchtype=author&query=Asch%2C+S+M">Steven M. Asch</a>, <a href="/search/cs?searchtype=author&query=Linos%2C+E">Eleni Linos</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14456v1-abstract-short" style="display: inline;"> Patient-centered research is increasingly important in narrowing the gap between research and patient care, yet incorporating patient perspectives into health research has been inconsistent. We propose an automated framework leveraging innovative natural language processing (NLP) and artificial intelligence (AI) with patient portal messages to generate research ideas that prioritize important pati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14456v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14456v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14456v1-abstract-full" style="display: none;"> Patient-centered research is increasingly important in narrowing the gap between research and patient care, yet incorporating patient perspectives into health research has been inconsistent. 
We propose an automated framework leveraging innovative natural language processing (NLP) and artificial intelligence (AI) with patient portal messages to generate research ideas that prioritize important patient issues. We further quantified the quality of AI-generated research topics. To define patient clinical concerns, we analyzed 614,464 patient messages from 25,549 individuals with breast or skin cancer obtained from a large academic hospital (2013 to 2024), constructing a 2-staged unsupervised NLP topic model. Then, we generated research topics to resolve the defined issues using a widely used AI (ChatGPT-4o, OpenAI Inc, April 2024 version) with prompt-engineering strategies. We guided AI to perform multi-level tasks: 1) knowledge interpretation and summarization (e.g., interpreting and summarizing the NLP-defined topics), 2) knowledge generation (e.g., generating research ideas corresponding to patients issues), 3) self-reflection and correction (e.g., ensuring and revising the research ideas after searching for scientific articles), and 4) self-reassurance (e.g., confirming and finalizing the research ideas). Six highly experienced breast oncologists and dermatologists assessed the significance and novelty of AI-generated research topics using a 5-point Likert scale (1-exceptional, 5-poor). One-third of the AI-suggested research topics were highly significant and novel when both scores were lower than the average. Two-thirds of the AI-suggested topics were novel in both cancers. Our findings demonstrate that AI-generated research topics reflecting patient perspectives via a large volume of patient messages can meaningfully guide future directions in patient-centered health research. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14456v1-abstract-full').style.display = 'none'; document.getElementById('2411.14456v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 15 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
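<p class="is-size-7">The selection rule described above ("highly significant and novel when both scores were lower than the average", on a 1 = exceptional, 5 = poor scale) can be made concrete in a few lines; the ratings below are illustrative values, not data from the study.</p> <pre><code>
# Count topics whose mean significance and novelty ratings are both better
# (lower) than the respective averages. The numbers are made-up examples.
import numpy as np

ratings = {                        # topic -> (mean significance, mean novelty)
    "symptom tracking":   (2.1, 2.4),
    "billing questions":  (3.8, 3.5),
    "side-effect triage": (1.9, 2.0),
}
sig = np.array([v[0] for v in ratings.values()])
nov = np.array([v[1] for v in ratings.values()])
both_better = (sig.mean() > sig) & (nov.mean() > nov)
print(f"highly significant and novel: {both_better.mean():.0%} of topics")
</code></pre>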
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14279">arXiv:2411.14279</a> <span> [<a href="https://arxiv.org/pdf/2411.14279">pdf</a>, <a href="https://arxiv.org/format/2411.14279">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Looking Beyond Text: Reducing Language bias in Large Vision-Language Models via Multimodal Dual-Attention and Soft-Image Guidance </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zhao%2C+H">Haozhe Zhao</a>, <a href="/search/cs?searchtype=author&query=Si%2C+S">Shuzheng Si</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yichi Zhang</a>, <a href="/search/cs?searchtype=author&query=Sun%2C+M">Maosong Sun</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+M">Mingjia Zhang</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+B">Baobao Chang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14279v1-abstract-short" style="display: inline;"> Large vision-language models (LVLMs) have achieved impressive results in various vision-language tasks. However, despite showing promising performance, LVLMs suffer from hallucinations caused by language bias, leading to diminished focus on images and ineffective visual comprehension. We identify two primary reasons for this bias: 1. Different scales of training data between the pretraining stage… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14279v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14279v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14279v1-abstract-full" style="display: none;"> Large vision-language models (LVLMs) have achieved impressive results in various vision-language tasks. However, despite showing promising performance, LVLMs suffer from hallucinations caused by language bias, leading to diminished focus on images and ineffective visual comprehension. We identify two primary reasons for this bias: 1. Different scales of training data between the pretraining stage of LLM and multimodal alignment stage. 2. The learned inference bias due to short-term dependency of text data. Therefore, we propose LACING, a systemic framework designed to address the language bias of LVLMs with muLtimodal duAl-attention meChanIsm (MDA) aNd soft-image Guidance (IFG). Specifically, MDA introduces a parallel dual-attention mechanism that enhances the integration of visual inputs across the model. IFG introduces a learnable soft visual prompt during training and inference to replace visual inputs, designed to compel LVLMs to prioritize text inputs. Then, IFG further proposes a novel decoding strategy using the soft visual prompt to mitigate the model's over-reliance on adjacent text inputs. 
Comprehensive experiments demonstrate that our method effectively debiases LVLMs from their language bias, enhancing visual comprehension and reducing hallucinations without requiring additional training resources or data. The code and model are available at [lacing-lvlm.github.io](https://lacing-lvlm.github.io). <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14279v1-abstract-full').style.display = 'none'; document.getElementById('2411.14279v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">19 pages, 12 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.14053">arXiv:2411.14053</a> <span> [<a href="https://arxiv.org/pdf/2411.14053">pdf</a>, <a href="https://arxiv.org/format/2411.14053">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Stereo Anything: Unifying Stereo Matching with Large-Scale Mixed Data </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xianda Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenming Zhang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Youmin Zhang</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+D">Dujun Nie</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+R">Ruilin Wang</a>, <a href="/search/cs?searchtype=author&query=Zheng%2C+W">Wenzhao Zheng</a>, <a href="/search/cs?searchtype=author&query=Poggi%2C+M">Matteo Poggi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.14053v1-abstract-short" style="display: inline;"> Stereo matching has been a pivotal component in 3D vision, aiming to find corresponding points between pairs of stereo images to recover depth information. In this work, we introduce StereoAnything, a highly practical solution for robust stereo matching. Rather than focusing on a specialized model, our goal is to develop a versatile foundational model capable of handling stereo images across diver… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14053v1-abstract-full').style.display = 'inline'; document.getElementById('2411.14053v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.14053v1-abstract-full" style="display: none;"> Stereo matching has been a pivotal component in 3D vision, aiming to find corresponding points between pairs of stereo images to recover depth information. In this work, we introduce StereoAnything, a highly practical solution for robust stereo matching. 
Rather than focusing on a specialized model, our goal is to develop a versatile foundational model capable of handling stereo images across diverse environments. To this end, we scale up the dataset by collecting labeled stereo images and generating synthetic stereo pairs from unlabeled monocular images. To further enrich the model's ability to generalize across different conditions, we introduce a novel synthetic dataset that complements existing data by adding variability in baselines, camera angles, and scene types. We extensively evaluate the zero-shot capabilities of our model on five public datasets, showcasing its impressive ability to generalize to new, unseen data. Code will be available at \url{https://github.com/XiandaGuo/OpenStereo}. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.14053v1-abstract-full').style.display = 'none'; document.getElementById('2411.14053v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 21 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code will be available at \url{https://github.com/XiandaGuo/OpenStereo}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13765">arXiv:2411.13765</a> <span> [<a href="https://arxiv.org/pdf/2411.13765">pdf</a>, <a href="https://arxiv.org/ps/2411.13765">ps</a>, <a href="https://arxiv.org/format/2411.13765">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Probability">math.PR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Information Theory">cs.IT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Optimization and Control">math.OC</span> </div> </div> <p class="title is-5 mathjax"> Schrödinger Bridge Problem for Jump Diffusions </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zlotchevski%2C+A">Andrei Zlotchevski</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Linan Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13765v1-abstract-short" style="display: inline;"> The Schrödinger bridge problem (SBP) seeks to find the measure $\hat{\mathbf{P}}$ on a certain path space which interpolates between state-space distributions $ρ_0$ at time $0$ and $ρ_T$ at time $T$ while minimizing the KL divergence (relative entropy) to a reference path measure $\mathbf{R}$. In this work, we tackle the SBP in the case when $\mathbf{R}$ is the path measure of a jump diffusion. 
Un… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13765v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13765v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13765v1-abstract-full" style="display: none;"> The Schrödinger bridge problem (SBP) seeks to find the measure $\hat{\mathbf{P}}$ on a certain path space which interpolates between state-space distributions $ρ_0$ at time $0$ and $ρ_T$ at time $T$ while minimizing the KL divergence (relative entropy) to a reference path measure $\mathbf{R}$. In this work, we tackle the SBP in the case when $\mathbf{R}$ is the path measure of a jump diffusion. Under mild assumptions, with both the operator theory approach and the stochastic calculus techniques, we establish an $h$-transform theory for jump diffusions and devise an approximation method to achieve the jump-diffusion SBP solution $\hat{\mathbf{P}}$ as the strong-convergence limit of a sequence of harmonic $h$-transforms. To the best of our knowledge, these results are novel in the study of SBP. Moreover, the $h$-transform framework and the approximation method developed in this work are robust and applicable to a relatively general class of jump diffusions. In addition, we examine the SBP of particular types of jump diffusions under additional regularity conditions and extend the existing results on the SBP from the diffusion case to the jump-diffusion setting. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13765v1-abstract-full').style.display = 'none'; document.getElementById('2411.13765v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">MSC Class:</span> 35Q93; 45K05; 60H10; 60H20; 60H30; 94A17 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13599">arXiv:2411.13599</a> <span> [<a href="https://arxiv.org/pdf/2411.13599">pdf</a>, <a href="https://arxiv.org/format/2411.13599">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Statistical Finance">q-fin.ST</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Can ChatGPT Overcome Behavioral Biases in the Financial Sector? 
Classify-and-Rethink: Multi-Step Zero-Shot Reasoning in the Gold Investment </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shuoling Liu</a>, <a href="/search/cs?searchtype=author&query=Jia%2C+G">Gaoguo Jia</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+Y">Yuhang Jiang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liyuan Chen</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+Q">Qiang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13599v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have achieved remarkable success recently, displaying exceptional capabilities in creating understandable and organized text. These LLMs have been utilized in diverse fields, such as clinical research, where domain-specific models like Med-Palm have achieved human-level performance. Recently, researchers have employed advanced prompt engineering to enhance the general… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13599v1-abstract-full').style.display = 'inline'; document.getElementById('2411.13599v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13599v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have achieved remarkable success recently, displaying exceptional capabilities in creating understandable and organized text. These LLMs have been utilized in diverse fields, such as clinical research, where domain-specific models like Med-Palm have achieved human-level performance. Recently, researchers have employed advanced prompt engineering to enhance the general reasoning ability of LLMs. Despite the remarkable success of zero-shot Chain-of-Thought (CoT) prompting in solving general reasoning tasks, the potential of these methods has received limited attention in financial reasoning tasks. To address this issue, we explore multiple prompt strategies and incorporate semantic news information to improve LLMs' performance on financial reasoning tasks. To the best of our knowledge, we are the first to explore this important issue by applying ChatGPT to gold investment. In this work, our aim is to investigate the financial reasoning capabilities of LLMs and their capacity to generate logical and persuasive investment opinions. We will use ChatGPT, one of the most powerful LLMs to date, together with prompt engineering to achieve this goal. Our research will focus on understanding the ability of LLMs to perform sophisticated analysis and reasoning within the context of investment decision-making. Our study finds that ChatGPT with CoT prompts can provide more explainable predictions and overcome behavioral biases, which is crucial in finance-related tasks and can achieve higher investment returns. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13599v1-abstract-full').style.display = 'none'; document.getElementById('2411.13599v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
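<p class="is-size-7">A rough sketch of what a two-step "classify-and-rethink" zero-shot prompt could look like is given below; the prompt wording and the ask_llm() helper are assumptions for illustration only, not the paper's actual prompts or pipeline.</p> <pre><code>
# Illustrative two-step zero-shot prompting flow: classify the news first,
# then re-reason step by step about behavioral biases before deciding.
def build_prompts(news):
    classify = (
        "You are a gold-market analyst. Classify the following news as "
        "bullish, bearish, or neutral for gold, and explain briefly.\n\n"
        "News: " + news
    )
    rethink = (
        "Given your classification above, let's think step by step about "
        "behavioral biases (e.g., overreaction, anchoring) that could "
        "distort this view, then state a final long/short/hold decision."
    )
    return classify, rethink

def ask_llm(prompt):
    """Placeholder: plug in any chat-completion client here."""
    raise NotImplementedError
</code></pre>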
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.13112">arXiv:2411.13112</a> <span> [<a href="https://arxiv.org/pdf/2411.13112">pdf</a>, <a href="https://arxiv.org/format/2411.13112">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> DriveMLLM: A Benchmark for Spatial Understanding with Multimodal Large Language Models in Autonomous Driving </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Guo%2C+X">Xianda Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Ruijun Zhang</a>, <a href="/search/cs?searchtype=author&query=Duan%2C+Y">Yiqun Duan</a>, <a href="/search/cs?searchtype=author&query=He%2C+Y">Yuhang He</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+C">Chenming Zhang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+S">Shuai Liu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.13112v2-abstract-short" style="display: inline;"> Autonomous driving requires a comprehensive understanding of 3D environments to facilitate high-level tasks such as motion prediction, planning, and mapping. In this paper, we introduce DriveMLLM, a benchmark specifically designed to evaluate the spatial understanding capabilities of multimodal large language models (MLLMs) in autonomous driving. DriveMLLM includes 880 front-facing camera images a… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13112v2-abstract-full').style.display = 'inline'; document.getElementById('2411.13112v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.13112v2-abstract-full" style="display: none;"> Autonomous driving requires a comprehensive understanding of 3D environments to facilitate high-level tasks such as motion prediction, planning, and mapping. In this paper, we introduce DriveMLLM, a benchmark specifically designed to evaluate the spatial understanding capabilities of multimodal large language models (MLLMs) in autonomous driving. DriveMLLM includes 880 front-facing camera images and introduces both absolute and relative spatial reasoning tasks, accompanied by linguistically diverse natural language questions. To measure MLLMs' performance, we propose novel evaluation metrics focusing on spatial understanding. We evaluate several state-of-the-art MLLMs on DriveMLLM, and our results reveal the limitations of current models in understanding complex spatial relationships in driving contexts. We believe these findings underscore the need for more advanced MLLM-based spatial reasoning methods and highlight the potential for DriveMLLM to drive further research in autonomous driving. Code will be available at \url{https://github.com/XiandaGuo/Drive-MLLM}. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.13112v2-abstract-full').style.display = 'none'; document.getElementById('2411.13112v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 26 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 20 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Code will be available at \url{https://github.com/XiandaGuo/Drive-MLLM}</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12853">arXiv:2411.12853</a> <span> [<a href="https://arxiv.org/pdf/2411.12853">pdf</a>, <a href="https://arxiv.org/format/2411.12853">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Biomolecules">q-bio.BM</span> </div> </div> <p class="title is-5 mathjax"> Integrating Secondary Structures Information into Triangular Spatial Relationships (TSR) for Advanced Protein Classification </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Khajouie%2C+P">Poorya Khajouie</a>, <a href="/search/cs?searchtype=author&query=Sarkar%2C+T">Titli Sarkar</a>, <a href="/search/cs?searchtype=author&query=Rauniyar%2C+K">Krishna Rauniyar</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Li Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+W">Wu Xu</a>, <a href="/search/cs?searchtype=author&query=Raghavan%2C+V">Vijay Raghavan</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12853v1-abstract-short" style="display: inline;"> Protein structures represent the key to deciphering biological functions. The more detailed form of similarity among these proteins is sometimes overlooked by the conventional structural comparison methods. In contrast, further advanced methods, such as Triangular Spatial Relationship (TSR), have been demonstrated to make finer differentiations. Still, the classical implementation of TSR does not… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12853v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12853v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12853v1-abstract-full" style="display: none;"> Protein structures represent the key to deciphering biological functions. The more detailed form of similarity among these proteins is sometimes overlooked by the conventional structural comparison methods. In contrast, further advanced methods, such as Triangular Spatial Relationship (TSR), have been demonstrated to make finer differentiations. Still, the classical implementation of TSR does not provide for the integration of secondary structure information, which is important for a more detailed understanding of the folding pattern of a protein. 
To overcome these limitations, we developed the SSE-TSR approach. The proposed method integrates secondary structure elements (SSEs) into TSR-based protein representations. This allows an enriched representation of protein structures by considering 18 different combinations of helix, strand, and coil arrangements. Our results show that using SSEs improves the accuracy and reliability of protein classification to varying degrees. We worked with two large protein datasets of 9.2K and 7.8K samples, respectively. We applied the SSE-TSR approach and used a neural network model for classification. Interestingly, introducing SSEs improved performance statistics for Dataset 1, with accuracy moving from 96.0% to 98.3%. For Dataset 2, where the performance statistics were already good, further small improvements were found with the introduction of SSE, giving an accuracy of 99.5% compared to 99.4%. These results show that SSE integration can dramatically improve TSR key discrimination, with significant benefits in datasets with low initial accuracies and only incremental gains in those with high baseline performance. Thus, SSE-TSR is a powerful bioinformatics tool that improves protein classification and understanding of protein function and interaction. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12853v1-abstract-full').style.display = 'none'; document.getElementById('2411.12853v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12840">arXiv:2411.12840</a> <span> [<a href="https://arxiv.org/pdf/2411.12840">pdf</a>, <a href="https://arxiv.org/format/2411.12840">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Statistics Theory">math.ST</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Logic in Computer Science">cs.LO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Category Theory">math.CT</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Probability">math.PR</span> </div> </div> <p class="title is-5 mathjax"> The Aldous--Hoover Theorem in Categorical Probability </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+L">Leihao Chen</a>, <a href="/search/cs?searchtype=author&query=Fritz%2C+T">Tobias Fritz</a>, <a href="/search/cs?searchtype=author&query=Gonda%2C+T">Tomáš Gonda</a>, <a href="/search/cs?searchtype=author&query=Klingler%2C+A">Andreas Klingler</a>, <a href="/search/cs?searchtype=author&query=Lorenzin%2C+A">Antonio Lorenzin</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12840v1-abstract-short" style="display: inline;"> The Aldous-Hoover Theorem concerns an infinite matrix of random variables whose distribution is invariant under finite permutations of rows and columns. 
It states that, up to equality in distribution, each random variable in the matrix can be expressed as a function only depending on four key variables: one common to the entire matrix, one that encodes information about its row, one that encodes i… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12840v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12840v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12840v1-abstract-full" style="display: none;"> The Aldous-Hoover Theorem concerns an infinite matrix of random variables whose distribution is invariant under finite permutations of rows and columns. It states that, up to equality in distribution, each random variable in the matrix can be expressed as a function only depending on four key variables: one common to the entire matrix, one that encodes information about its row, one that encodes information about its column, and a fourth one specific to the matrix entry. We state and prove the theorem within a category-theoretic approach to probability, namely the theory of Markov categories. This makes the proof more transparent and intuitive when compared to measure-theoretic ones. A key role is played by a newly identified categorical property, the Cauchy--Schwarz axiom, which also facilitates a new synthetic de Finetti Theorem. We further provide a variant of our proof using the ordered Markov property and the d-separation criterion, both generalized from Bayesian networks to Markov categories. We expect that this approach will facilitate a systematic development of more complex results in the future, such as categorical approaches to hierarchical exchangeability. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12840v1-abstract-full').style.display = 'none'; document.getElementById('2411.12840v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">39 pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12426">arXiv:2411.12426</a> <span> [<a href="https://arxiv.org/pdf/2411.12426">pdf</a>, <a href="https://arxiv.org/format/2411.12426">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Motif Channel Opened in a White-Box: Stereo Matching via Motif Correlation Graph </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Ziyang Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Y">Yongjun Zhang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+W">Wenting Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+B">Bingshu Wang</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Y">Yong Zhao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C+L+P">C. L. 
Philip Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12426v1-abstract-short" style="display: inline;"> Real-world applications of stereo matching, such as autonomous driving, place stringent demands on both safety and accuracy. However, learning-based stereo matching methods inherently suffer from the loss of geometric structures in certain feature channels, creating a bottleneck in achieving precise detail matching. Additionally, these methods lack interpretability due to the black-box nature of d… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12426v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12426v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12426v1-abstract-full" style="display: none;"> Real-world applications of stereo matching, such as autonomous driving, place stringent demands on both safety and accuracy. However, learning-based stereo matching methods inherently suffer from the loss of geometric structures in certain feature channels, creating a bottleneck in achieving precise detail matching. Additionally, these methods lack interpretability due to the black-box nature of deep learning. In this paper, we propose MoCha-V2, a novel learning-based paradigm for stereo matching. MoCha-V2 introduces the Motif Correlation Graph (MCG) to capture recurring textures, which are referred to as ``motifs" within feature channels. These motifs reconstruct geometric structures and are learned in a more interpretable way. Subsequently, we integrate features from multiple frequency domains through wavelet inverse transformation. The resulting motif features are utilized to restore geometric structures in the stereo matching process. Experimental results demonstrate the effectiveness of MoCha-V2. MoCha-V2 achieved 1st place on the Middlebury benchmark at the time of its release. Code is available at https://github.com/ZYangChen/MoCha-Stereo. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12426v1-abstract-full').style.display = 'none'; document.getElementById('2411.12426v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
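<p class="is-size-7">For readers unfamiliar with the matching step that the abstract refers to, the sketch below builds a plain correlation cost volume, the generic construction many learning-based stereo methods start from; it is not the Motif Correlation Graph of MoCha-V2.</p> <pre><code>
# Generic correlation cost volume: for each candidate disparity d, correlate
# the left feature map with the right feature map shifted by d columns.
import numpy as np

def correlation_cost_volume(feat_l, feat_r, max_disp):
    """feat_l, feat_r: (C, H, W) feature maps; returns (max_disp, H, W)."""
    C, H, W = feat_l.shape
    cost = np.zeros((max_disp, H, W), dtype=feat_l.dtype)
    for d in range(max_disp):
        cost[d, :, d:] = (feat_l[:, :, d:] * feat_r[:, :, :W - d]).mean(axis=0)
    return cost

# The per-pixel disparity estimate is the best-matching shift:
# disparity = correlation_cost_volume(feat_l, feat_r, 64).argmax(axis=0)
</code></pre>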
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.12273">arXiv:2411.12273</a> <span> [<a href="https://arxiv.org/pdf/2411.12273">pdf</a>, <a href="https://arxiv.org/format/2411.12273">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Acquire Precise and Comparable Fundus Image Quality Score: FTHNet and FQS Dataset </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Gong%2C+Z">Zheng Gong</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+Z">Zhuo Deng</a>, <a href="/search/cs?searchtype=author&query=Gan%2C+R">Run Gan</a>, <a href="/search/cs?searchtype=author&query=Niu%2C+Z">Zhiyuan Niu</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lu Chen</a>, <a href="/search/cs?searchtype=author&query=Huang%2C+C">Canfeng Huang</a>, <a href="/search/cs?searchtype=author&query=Liang%2C+J">Jia Liang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+W">Weihao Gao</a>, <a href="/search/cs?searchtype=author&query=Li%2C+F">Fang Li</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+S">Shaochong Zhang</a>, <a href="/search/cs?searchtype=author&query=Ma%2C+L">Lan Ma</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.12273v1-abstract-short" style="display: inline;"> Retinal fundus images are used extensively in diagnosis, and their quality can directly affect the diagnosis results. However, due to insufficient datasets and limited algorithm application, current fundus image quality assessment (FIQA) methods are not powerful enough to meet ophthalmologists' demands. In this paper, we address the limitations of datasets and algorithms in FIQA. First, we… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12273v1-abstract-full').style.display = 'inline'; document.getElementById('2411.12273v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.12273v1-abstract-full" style="display: none;"> Retinal fundus images are used extensively in diagnosis, and their quality can directly affect the diagnosis results. However, due to insufficient datasets and limited algorithm application, current fundus image quality assessment (FIQA) methods are not powerful enough to meet ophthalmologists' demands. In this paper, we address the limitations of datasets and algorithms in FIQA. First, we establish a new FIQA dataset, Fundus Quality Score (FQS), which includes 2246 fundus images with two labels: a continuous Mean Opinion Score varying from 0 to 100 and a three-level quality label. Then, we propose a FIQA Transformer-based Hypernetwork (FTHNet) to solve these tasks with regression results rather than the classification results of conventional FIQA works. The FTHNet is optimized for the FIQA tasks with extensive experiments. 
Results on our FQS dataset show that the FTHNet can give quality scores for fundus images with PLCC of 0.9423 and SRCC of 0.9488, significantly outperforming other methods with fewer parameters and lower computational complexity. We successfully build a dataset and model addressing the problems of current FIQA methods. Furthermore, the model deployment experiments demonstrate its potential in automatic medical image quality control. All experiments are carried out with 10-fold cross-validation to ensure the significance of the results. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.12273v1-abstract-full').style.display = 'none'; document.getElementById('2411.12273v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 19 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">11 pages, 7 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11915">arXiv:2411.11915</a> <span> [<a href="https://arxiv.org/pdf/2411.11915">pdf</a>, <a href="https://arxiv.org/format/2411.11915">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Genomics">q-bio.GN</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Phenome-wide causal proteomics enhance systemic lupus erythematosus flare prediction: A study in Asian populations </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liying Chen</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+O">Ou Deng</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+T">Ting Fang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+M">Mei Chen</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+X">Xvfeng Zhang</a>, <a href="/search/cs?searchtype=author&query=Cong%2C+R">Ruichen Cong</a>, <a href="/search/cs?searchtype=author&query=Lu%2C+D">Dingqi Lu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+R">Runrun Zhang</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+Q">Qun Jin</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+X">Xinchang Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11915v1-abstract-short" style="display: inline;"> Objective: Systemic lupus erythematosus (SLE) is a complex autoimmune disease characterized by unpredictable flares. This study aimed to develop a novel proteomics-based risk prediction model specifically for Asian SLE populations to enhance personalized disease management and early intervention. 
Methods: A longitudinal cohort study was conducted over 48 weeks, including 139 SLE patients monitored… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11915v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11915v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11915v1-abstract-full" style="display: none;"> Objective: Systemic lupus erythematosus (SLE) is a complex autoimmune disease characterized by unpredictable flares. This study aimed to develop a novel proteomics-based risk prediction model specifically for Asian SLE populations to enhance personalized disease management and early intervention. Methods: A longitudinal cohort study was conducted over 48 weeks, including 139 SLE patients monitored every 12 weeks. Patients were classified into flare (n = 53) and non-flare (n = 86) groups. Baseline plasma samples underwent data-independent acquisition (DIA) proteomics analysis, and phenome-wide Mendelian randomization (PheWAS) was performed to evaluate causal relationships between proteins and clinical predictors. Logistic regression (LR) and random forest (RF) models were used to integrate proteomic and clinical data for flare risk prediction. Results: Five proteins (SAA1, B4GALT5, GIT2, NAA15, and RPIA) were significantly associated with SLE Disease Activity Index-2K (SLEDAI-2K) scores and 1-year flare risk, implicating key pathways such as B-cell receptor signaling and platelet degranulation. SAA1 demonstrated causal effects on flare-related clinical markers, including hemoglobin and red blood cell counts. A combined model integrating clinical and proteomic data achieved the highest predictive accuracy (AUC = 0.769), surpassing individual models. SAA1 was highlighted as a priority biomarker for rapid flare discrimination. Conclusion: The integration of proteomic and clinical data significantly improves flare prediction in Asian SLE patients. The identification of key proteins and their causal relationships with flare-related clinical markers provides valuable insights for proactive SLE management and personalized therapeutic approaches. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11915v1-abstract-full').style.display = 'none'; document.getElementById('2411.11915v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 17 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
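The modelling step summarized in the abstract above (logistic regression and random forest fitted on combined clinical and proteomic features, judged by AUC) can be pictured with a minimal scikit-learn sketch. The data, feature blocks, and split below are synthetic placeholders, not the study's cohort or variables.

```python
# Minimal sketch of the combined clinical + proteomic flare-risk models
# described above (logistic regression and random forest, AUC-evaluated).
# Data and feature names are synthetic placeholders, not the study cohort.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
n = 139                                  # cohort size reported in the abstract
proteomic = rng.normal(size=(n, 5))      # e.g. SAA1, B4GALT5, GIT2, NAA15, RPIA
clinical = rng.normal(size=(n, 3))       # e.g. SLEDAI-2K, hemoglobin, RBC count
X = np.hstack([proteomic, clinical])     # the combined model uses both blocks
y = rng.integers(0, 2, size=n)           # 1 = flare during follow-up, 0 = no flare

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
for name, model in [("LR", LogisticRegression(max_iter=1000)),
                    ("RF", RandomForestClassifier(n_estimators=200, random_state=0))]:
    model.fit(X_tr, y_tr)
    auc = roc_auc_score(y_te, model.predict_proba(X_te)[:, 1])
    print(f"{name} AUC: {auc:.3f}")
```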
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.11544">arXiv:2411.11544</a> <span> [<a href="https://arxiv.org/pdf/2411.11544">pdf</a>, <a href="https://arxiv.org/ps/2411.11544">ps</a>, <a href="https://arxiv.org/format/2411.11544">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Data Structures and Algorithms">cs.DS</span> </div> </div> <p class="title is-5 mathjax"> The Complexity Landscape of Dynamic Distributed Subgraph Finding </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chang%2C+Y">Yi-Jun Chang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lyuting Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yanyu Chen</a>, <a href="/search/cs?searchtype=author&query=Mishra%2C+G">Gopinath Mishra</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Mingyang Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.11544v1-abstract-short" style="display: inline;"> Bonne and Censor-Hillel (ICALP 2019) initiated the study of distributed subgraph finding in dynamic networks of limited bandwidth. For the case where the target subgraph is a clique, they determined the tight bandwidth complexity bounds in nearly all settings. However, several open questions remain, and very little is known about finding subgraphs beyond cliques. In this work, we consider these qu… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11544v1-abstract-full').style.display = 'inline'; document.getElementById('2411.11544v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.11544v1-abstract-full" style="display: none;"> Bonne and Censor-Hillel (ICALP 2019) initiated the study of distributed subgraph finding in dynamic networks of limited bandwidth. For the case where the target subgraph is a clique, they determined the tight bandwidth complexity bounds in nearly all settings. However, several open questions remain, and very little is known about finding subgraphs beyond cliques. In this work, we consider these questions and explore subgraphs beyond cliques. For finding cliques, we establish an $惟(\log \log n)$ bandwidth lower bound for one-round membership-detection under edge insertions only and an $惟(\log \log \log n)$ bandwidth lower bound for one-round detection under both edge insertions and node insertions. Moreover, we demonstrate new algorithms to show that our lower bounds are tight in bounded-degree networks when the target subgraph is a triangle. Prior to our work, no lower bounds were known for these problems. For finding subgraphs beyond cliques, we present a complete characterization of the bandwidth complexity of the membership-listing problem for every target subgraph, every number of rounds, and every type of topological change: node insertions, node deletions, edge insertions, and edge deletions. We also show partial characterizations for one-round membership-detection and listing. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.11544v1-abstract-full').style.display = 'none'; document.getElementById('2411.11544v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 18 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">37 Pages</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.09852">arXiv:2411.09852</a> <span> [<a href="https://arxiv.org/pdf/2411.09852">pdf</a>, <a href="https://arxiv.org/format/2411.09852">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> InterFormer: Towards Effective Heterogeneous Interaction Learning for Click-Through Rate Prediction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zeng%2C+Z">Zhichen Zeng</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaolong Liu</a>, <a href="/search/cs?searchtype=author&query=Hang%2C+M">Mengyue Hang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xiaoyi Liu</a>, <a href="/search/cs?searchtype=author&query=Zhou%2C+Q">Qinghai Zhou</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chaofei Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yiqun Liu</a>, <a href="/search/cs?searchtype=author&query=Ruan%2C+Y">Yichen Ruan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Laming Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Y">Yuxin Chen</a>, <a href="/search/cs?searchtype=author&query=Hao%2C+Y">Yujia Hao</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+J">Jiaqi Xu</a>, <a href="/search/cs?searchtype=author&query=Nie%2C+J">Jade Nie</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+X">Xi Liu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+B">Buyun Zhang</a>, <a href="/search/cs?searchtype=author&query=Wen%2C+W">Wei Wen</a>, <a href="/search/cs?searchtype=author&query=Yuan%2C+S">Siyang Yuan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+K">Kai Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+W">Wen-Yen Chen</a>, <a href="/search/cs?searchtype=author&query=Han%2C+Y">Yiping Han</a>, <a href="/search/cs?searchtype=author&query=Li%2C+H">Huayu Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+C">Chunzhi Yang</a>, <a href="/search/cs?searchtype=author&query=Long%2C+B">Bo Long</a>, <a href="/search/cs?searchtype=author&query=Yu%2C+P+S">Philip S. Yu</a>, <a href="/search/cs?searchtype=author&query=Tong%2C+H">Hanghang Tong</a> , et al. 
(1 additional authors not shown) </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.09852v1-abstract-short" style="display: inline;"> Click-through rate (CTR) prediction, which predicts the probability of a user clicking an ad, is a fundamental task in recommender systems. The emergence of heterogeneous information, such as user profile and behavior sequences, depicts user interests from different aspects. A mutually beneficial integration of heterogeneous information is the cornerstone towards the success of CTR prediction. How… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09852v1-abstract-full').style.display = 'inline'; document.getElementById('2411.09852v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.09852v1-abstract-full" style="display: none;"> Click-through rate (CTR) prediction, which predicts the probability of a user clicking an ad, is a fundamental task in recommender systems. The emergence of heterogeneous information, such as user profile and behavior sequences, depicts user interests from different aspects. A mutually beneficial integration of heterogeneous information is the cornerstone towards the success of CTR prediction. However, most of the existing methods suffer from two fundamental limitations, including (1) insufficient inter-mode interaction due to the unidirectional information flow between modes, and (2) aggressive information aggregation caused by early summarization, resulting in excessive information loss. To address the above limitations, we propose a novel module named InterFormer to learn heterogeneous information interaction in an interleaving style. To achieve better interaction learning, InterFormer enables bidirectional information flow for mutually beneficial learning across different modes. To avoid aggressive information aggregation, we retain complete information in each data mode and use a separate bridging arch for effective information selection and summarization. Our proposed InterFormer achieves state-of-the-art performance on three public datasets and a large-scale industrial dataset. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.09852v1-abstract-full').style.display = 'none'; document.getElementById('2411.09852v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 14 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
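A rough sketch of the bidirectional information flow described in the InterFormer abstract above: each data mode attends to the other while both keep their full, unsummarized token sequences for later selection. The module, dimensions, and residual wiring here are illustrative assumptions, not the paper's actual architecture.

```python
# Illustrative sketch (not the actual InterFormer code): two heterogeneous
# modes exchange information through bidirectional cross-attention while
# each retains its full token sequence (no early summarization).
import torch
import torch.nn as nn

class BidirectionalCrossAttention(nn.Module):
    def __init__(self, dim: int, heads: int = 4):
        super().__init__()
        self.a_to_b = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.b_to_a = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, mode_a, mode_b):
        # mode_a: e.g. profile tokens, mode_b: e.g. behavior-sequence tokens
        a_upd, _ = self.a_to_b(query=mode_a, key=mode_b, value=mode_b)
        b_upd, _ = self.b_to_a(query=mode_b, key=mode_a, value=mode_a)
        # residual updates; complete sequences are kept in both modes
        return mode_a + a_upd, mode_b + b_upd

profile = torch.randn(2, 8, 64)      # (batch, tokens, dim)
behavior = torch.randn(2, 32, 64)
block = BidirectionalCrossAttention(64)
profile, behavior = block(profile, behavior)
print(profile.shape, behavior.shape)
```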
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">10 pages, 6 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.08302">arXiv:2411.08302</a> <span> [<a href="https://arxiv.org/pdf/2411.08302">pdf</a>, <a href="https://arxiv.org/format/2411.08302">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> R3HF: Reward Redistribution for Enhancing Reinforcement Learning from Human Feedback </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Li%2C+J">Jiahui Li</a>, <a href="/search/cs?searchtype=author&query=Chang%2C+T">Tai-wei Chang</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+F">Fengda Zhang</a>, <a href="/search/cs?searchtype=author&query=Kuang%2C+K">Kun Kuang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.08302v1-abstract-short" style="display: inline;"> Reinforcement learning from human feedback (RLHF) provides a paradigm for aligning large language models (LLMs) with human preferences. This involves the initial training of a reward model based on pairwise human feedback. The reward model is subsequently utilized in reinforcement learning to assess the scores of each generated sentence as a whole, further guiding the optimization of LLMs. However… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08302v1-abstract-full').style.display = 'inline'; document.getElementById('2411.08302v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.08302v1-abstract-full" style="display: none;"> Reinforcement learning from human feedback (RLHF) provides a paradigm for aligning large language models (LLMs) with human preferences. This involves the initial training of a reward model based on pairwise human feedback. The reward model is subsequently utilized in reinforcement learning to assess the scores of each generated sentence as a whole, further guiding the optimization of LLMs. However, current approaches have a significant shortcoming: \emph{They allocate a single, sparse, and delayed reward to an entire sequence of output}. This may overlook some significant individual contributions of each token towards the desired outcome. To overcome this limitation, our paper proposes a novel reward redistribution method called R3HF, which facilitates a more fine-grained, token-level reward allocation. Specifically, our method treats the reward prediction task of the reward model as a regression problem. As a result, the redistributed rewards are computed by evaluating the specific contribution of each token to the reward model's output. This detailed approach improves the model's understanding of language nuances, leading to more precise enhancements in its performance. 
Our method is crafted to integrate seamlessly with most current techniques while incurring minimal computational costs. Through comprehensive experiments across diverse datasets and tasks, we have verified the effectiveness and superiority of our approach. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.08302v1-abstract-full').style.display = 'none'; document.getElementById('2411.08302v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07843">arXiv:2411.07843</a> <span> [<a href="https://arxiv.org/pdf/2411.07843">pdf</a>, <a href="https://arxiv.org/format/2411.07843">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Chain Association-based Attacking and Shielding Natural Language Processing Systems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Huang%2C+J">Jiacheng Huang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Long Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07843v1-abstract-short" style="display: inline;"> Association, as a gift, enables people to avoid mentioning something in completely straightforward words while still allowing others to understand what they intend to refer to. In this paper, we propose a chain association-based adversarial attack against natural language processing systems, utilizing the comprehension gap between humans and machines. We first generate a chain association graph for Chinese… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07843v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07843v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07843v1-abstract-full" style="display: none;"> Association, as a gift, enables people to avoid mentioning something in completely straightforward words while still allowing others to understand what they intend to refer to. In this paper, we propose a chain association-based adversarial attack against natural language processing systems, utilizing the comprehension gap between humans and machines. We first generate a chain association graph for Chinese characters based on the association paradigm to build the search space of potential adversarial examples. Then, we introduce a discrete particle swarm optimization algorithm to search for the optimal adversarial examples. We conduct comprehensive experiments and show that advanced natural language processing models and applications, including large language models, are vulnerable to our attack, while humans appear good at understanding the perturbed text. 
We also explore two methods, including adversarial training and associative graph-based recovery, to shield systems from chain association-based attacks. Since a few examples use derogatory terms, this paper contains material that may be offensive or upsetting to some people. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07843v1-abstract-full').style.display = 'none'; document.getElementById('2411.07843v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07672">arXiv:2411.07672</a> <span> [<a href="https://arxiv.org/pdf/2411.07672">pdf</a>, <a href="https://arxiv.org/format/2411.07672">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Rethinking Structure Learning For Graph Neural Networks </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yilun Zheng</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhuofan Zhang</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Ziming Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+S">Sitao Luan</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xiaojiang Peng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lihui Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07672v1-abstract-short" style="display: inline;"> To improve the performance of Graph Neural Networks (GNNs), Graph Structure Learning (GSL) has been extensively applied to reconstruct or refine original graph structures, effectively addressing issues like heterophily, over-squashing, and noisy structures. While GSL is generally thought to improve GNN performance, it often leads to longer training times and more hyperparameter tuning. Besides, th… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07672v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07672v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07672v1-abstract-full" style="display: none;"> To improve the performance of Graph Neural Networks (GNNs), Graph Structure Learning (GSL) has been extensively applied to reconstruct or refine original graph structures, effectively addressing issues like heterophily, over-squashing, and noisy structures. While GSL is generally thought to improve GNN performance, it often leads to longer training times and more hyperparameter tuning. Besides, the distinctions among current GSL methods remain ambiguous from the perspective of GNN training, and there is a lack of theoretical analysis to quantify their effectiveness. Recent studies further suggest that, under fair comparisons with the same hyperparameter tuning, GSL does not consistently outperform baseline GNNs. 
This motivates us to ask a critical question: is GSL really useful for GNNs? To address this question, this paper makes two key contributions. First, we propose a new GSL framework, which includes three steps: GSL base (the representation used for GSL) construction, new structure construction, and view fusion, to better understand the effectiveness of GSL in GNNs. Second, after graph convolution, we analyze the differences in mutual information (MI) between node representations derived from the original topology and those from the newly constructed topology. Surprisingly, our empirical observations and theoretical analysis show that no matter which type of graph structure construction methods are used, after feeding the same GSL bases to the newly constructed graph, there is no MI gain compared to the original GSL bases. To fairly reassess the effectiveness of GSL, we conduct ablation experiments and find that it is the pretrained GSL bases that enhance GNN performance, and in most cases, GSL cannot improve GNN performance. This finding encourages us to rethink the essential components in GNNs, such as self-training and structural encoding, in GNN design rather than GSL. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07672v1-abstract-full').style.display = 'none'; document.getElementById('2411.07672v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07663">arXiv:2411.07663</a> <span> [<a href="https://arxiv.org/pdf/2411.07663">pdf</a>, <a href="https://arxiv.org/format/2411.07663">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Social and Information Networks">cs.SI</span> </div> </div> <p class="title is-5 mathjax"> Is Graph Convolution Always Beneficial For Every Feature? </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Zheng%2C+Y">Yilun Zheng</a>, <a href="/search/cs?searchtype=author&query=Li%2C+X">Xiang Li</a>, <a href="/search/cs?searchtype=author&query=Luan%2C+S">Sitao Luan</a>, <a href="/search/cs?searchtype=author&query=Peng%2C+X">Xiaojiang Peng</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lihui Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07663v1-abstract-short" style="display: inline;"> Graph Neural Networks (GNNs) have demonstrated strong capabilities in processing structured data. While traditional GNNs typically treat each feature dimension equally during graph convolution, we raise an important question: Is the graph convolution operation equally beneficial for each feature? 
If not, the convolution operation on certain feature dimensions can possibly lead to harmful effects,… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07663v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07663v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07663v1-abstract-full" style="display: none;"> Graph Neural Networks (GNNs) have demonstrated strong capabilities in processing structured data. While traditional GNNs typically treat each feature dimension equally during graph convolution, we raise an important question: Is the graph convolution operation equally beneficial for each feature? If not, the convolution operation on certain feature dimensions can possibly lead to harmful effects, even worse than the convolution-free models. In prior studies, to assess the impacts of graph convolution on features, people proposed metrics based on feature homophily to measure feature consistency with the graph topology. However, these metrics have shown unsatisfactory alignment with GNN performance and have not been effectively employed to guide feature selection in GNNs. To address these limitations, we introduce a novel metric, Topological Feature Informativeness (TFI), to distinguish between GNN-favored and GNN-disfavored features, where its effectiveness is validated through both theoretical analysis and empirical observations. Based on TFI, we propose a simple yet effective Graph Feature Selection (GFS) method, which processes GNN-favored and GNN-disfavored features separately, using GNNs and non-GNN models. Compared to original GNNs, GFS significantly improves the extraction of useful topological information from each feature with comparable computational costs. Extensive experiments show that after applying GFS to 8 baseline and state-of-the-art (SOTA) GNN architectures across 10 datasets, 83.75% of the GFS-augmented cases show significant performance boosts. Furthermore, our proposed TFI metric outperforms other feature selection methods. These results validate the effectiveness of both GFS and TFI. Additionally, we demonstrate that GFS's improvements are robust to hyperparameter tuning, highlighting its potential as a universal method for enhancing various GNN architectures. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07663v1-abstract-full').style.display = 'none'; document.getElementById('2411.07663v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 12 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
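The separate handling of GNN-favored and GNN-disfavored features described above can be sketched as a routing step: score each feature dimension, send high-scoring dimensions to a graph branch and the rest to a non-graph branch, then combine their outputs downstream. The per-feature score used below is a placeholder stand-in, not the TFI metric defined in the paper.

```python
# Illustrative feature-routing sketch: feature dimensions judged "graph-friendly"
# by a per-feature score go to a graph branch, the rest to a plain (non-GNN) branch.
# The score used here (a variance proxy) is a placeholder, NOT the paper's TFI.
import numpy as np

def placeholder_feature_scores(X):
    # Placeholder per-feature score; the actual TFI metric is defined in the paper.
    return X.var(axis=0)

def split_features(X, threshold):
    scores = placeholder_feature_scores(X)
    favored = scores >= threshold
    return X[:, favored], X[:, ~favored]   # -> graph branch, non-graph branch

X = np.random.rand(100, 16)                # node features: 100 nodes, 16 dims
gnn_feats, mlp_feats = split_features(X, threshold=0.08)
print(gnn_feats.shape, mlp_feats.shape)
# Downstream, the two branches' embeddings or predictions would be combined.
```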
</p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.07025">arXiv:2411.07025</a> <span> [<a href="https://arxiv.org/pdf/2411.07025">pdf</a>, <a href="https://arxiv.org/format/2411.07025">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Graphics">cs.GR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Scaling Mesh Generation via Compressive Tokenization </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Weng%2C+H">Haohan Weng</a>, <a href="/search/cs?searchtype=author&query=Zhao%2C+Z">Zibo Zhao</a>, <a href="/search/cs?searchtype=author&query=Lei%2C+B">Biwen Lei</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+X">Xianghui Yang</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+J">Jian Liu</a>, <a href="/search/cs?searchtype=author&query=Lai%2C+Z">Zeqiang Lai</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhuo Chen</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+Y">Yuhong Liu</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+J">Jie Jiang</a>, <a href="/search/cs?searchtype=author&query=Guo%2C+C">Chunchao Guo</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+T">Tong Zhang</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+S">Shenghua Gao</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+C+L+P">C. L. Philip Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.07025v1-abstract-short" style="display: inline;"> We propose a compressive yet effective mesh representation, Blocked and Patchified Tokenization (BPT), facilitating the generation of meshes exceeding 8k faces. BPT compresses mesh sequences by employing block-wise indexing and patch aggregation, reducing their length by approximately 75\% compared to the original sequences. This compression milestone unlocks the potential to utilize mesh data wit… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07025v1-abstract-full').style.display = 'inline'; document.getElementById('2411.07025v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.07025v1-abstract-full" style="display: none;"> We propose a compressive yet effective mesh representation, Blocked and Patchified Tokenization (BPT), facilitating the generation of meshes exceeding 8k faces. BPT compresses mesh sequences by employing block-wise indexing and patch aggregation, reducing their length by approximately 75\% compared to the original sequences. This compression milestone unlocks the potential to utilize mesh data with significantly more faces, thereby enhancing detail richness and improving generation robustness. Empowered with the BPT, we have built a foundation mesh generative model training on scaled mesh data to support flexible control for point clouds and images. Our model demonstrates the capability to generate meshes with intricate details and accurate topology, achieving SoTA performance on mesh generation and reaching the level for direct product usage. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.07025v1-abstract-full').style.display = 'none'; document.getElementById('2411.07025v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 11 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Homepage: https://whaohan.github.io/bpt , Code: https://github.com/whaohan/bpt</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.06213">arXiv:2411.06213</a> <span> [<a href="https://arxiv.org/pdf/2411.06213">pdf</a>, <a href="https://arxiv.org/ps/2411.06213">ps</a>, <a href="https://arxiv.org/format/2411.06213">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Incorporating Human Explanations for Robust Hate Speech Detection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+J+L">Jennifer L. Chen</a>, <a href="/search/cs?searchtype=author&query=Ladhak%2C+F">Faisal Ladhak</a>, <a href="/search/cs?searchtype=author&query=Li%2C+D">Daniel Li</a>, <a href="/search/cs?searchtype=author&query=Elhadad%2C+N">No茅mie Elhadad</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.06213v1-abstract-short" style="display: inline;"> Given the black-box nature and complexity of large transformer language models (LM), concerns about generalizability and robustness present ethical implications for domains such as hate speech (HS) detection. Using the content rich Social Bias Frames dataset, containing human-annotated stereotypes, intent, and targeted groups, we develop a three stage analysis to evaluate if LMs faithfully assess… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06213v1-abstract-full').style.display = 'inline'; document.getElementById('2411.06213v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.06213v1-abstract-full" style="display: none;"> Given the black-box nature and complexity of large transformer language models (LM), concerns about generalizability and robustness present ethical implications for domains such as hate speech (HS) detection. Using the content rich Social Bias Frames dataset, containing human-annotated stereotypes, intent, and targeted groups, we develop a three stage analysis to evaluate if LMs faithfully assess hate speech. First, we observe the need for modeling contextually grounded stereotype intents to capture implicit semantic meaning. Next, we design a new task, Stereotype Intent Entailment (SIE), which encourages a model to contextually understand stereotype presence. Finally, through ablation tests and user studies, we find a SIE objective improves content understanding, but challenges remain in modeling implicit intent. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.06213v1-abstract-full').style.display = 'none'; document.getElementById('2411.06213v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 9 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">2021 ACL Unimplicit Workshop</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05959">arXiv:2411.05959</a> <span> [<a href="https://arxiv.org/pdf/2411.05959">pdf</a>, <a href="https://arxiv.org/format/2411.05959">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Image and Video Processing">eess.IV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Efficient Self-Supervised Barlow Twins from Limited Tissue Slide Cohorts for Colonic Pathology Diagnostics </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Notton%2C+C">Cassandre Notton</a>, <a href="/search/cs?searchtype=author&query=Sharma%2C+V">Vasudev Sharma</a>, <a href="/search/cs?searchtype=author&query=Trinh%2C+V+Q">Vincent Quoc-Huy Trinh</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lina Chen</a>, <a href="/search/cs?searchtype=author&query=Xu%2C+M">Minqi Xu</a>, <a href="/search/cs?searchtype=author&query=Varma%2C+S">Sonal Varma</a>, <a href="/search/cs?searchtype=author&query=Hosseini%2C+M+S">Mahdi S. Hosseini</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05959v1-abstract-short" style="display: inline;"> Colorectal cancer (CRC) is one of the few cancers that have an established dysplasia-carcinoma sequence that benefits from screening. Everyone over 50 years of age in Canada is eligible for CRC screening. About 20\% of those people will undergo a biopsy for a pre-neoplastic polyp and, in many cases, multiple polyps. As such, these polyp biopsies make up the bulk of a pathologist's workload. Develo… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05959v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05959v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05959v1-abstract-full" style="display: none;"> Colorectal cancer (CRC) is one of the few cancers that have an established dysplasia-carcinoma sequence that benefits from screening. Everyone over 50 years of age in Canada is eligible for CRC screening. About 20\% of those people will undergo a biopsy for a pre-neoplastic polyp and, in many cases, multiple polyps. As such, these polyp biopsies make up the bulk of a pathologist's workload. Developing an efficient computational model to help screen these polyp biopsies can improve the pathologist's workflow and help guide their attention to critical areas on the slide. 
DL models face significant challenges in computational pathology (CPath) because of the gigapixel image size of whole-slide images and the scarcity of detailed annotated datasets. It is, therefore, crucial to leverage self-supervised learning (SSL) methods to alleviate the burden and cost of data annotation. However, current research lacks methods to apply SSL frameworks to analyze pathology data effectively. This paper aims to propose an optimized Barlow Twins framework for colorectal polyps screening. We adapt its hyperparameters, augmentation strategy and encoder to the specificity of the pathology data to enhance performance. Additionally, we investigate the best Field of View (FoV) for colorectal polyps screening and propose a new benchmark dataset for CRC screening, made of four types of colorectal polyps and normal tissue, by performing downstream tasking on MHIST and NCT-CRC-7K datasets. Furthermore, we show that the SSL representations are more meaningful and qualitative than the supervised ones and that Barlow Twins benefits from the Swin Transformer when applied to pathology data. Code is available from https://github.com/AtlasAnalyticsLab/PathBT. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05959v1-abstract-full').style.display = 'none'; document.getElementById('2411.05959v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Submission Under Review</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.05928">arXiv:2411.05928</a> <span> [<a href="https://arxiv.org/pdf/2411.05928">pdf</a>, <a href="https://arxiv.org/format/2411.05928">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Reducing Distraction in Long-Context Language Models by Focused Learning </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+Z">Zijun Wu</a>, <a href="/search/cs?searchtype=author&query=Liu%2C+B">Bingyuan Liu</a>, <a href="/search/cs?searchtype=author&query=Yan%2C+R">Ran Yan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lei Chen</a>, <a href="/search/cs?searchtype=author&query=Delteil%2C+T">Thomas Delteil</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.05928v1-abstract-short" style="display: inline;"> Recent advancements in Large Language Models (LLMs) have significantly enhanced their capacity to process long contexts. However, effectively utilizing this long context remains a challenge due to the issue of distraction, where irrelevant information dominates lengthy contexts, causing LLMs to lose focus on the most relevant segments. 
To address this, we propose a novel training method that enhan… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05928v1-abstract-full').style.display = 'inline'; document.getElementById('2411.05928v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.05928v1-abstract-full" style="display: none;"> Recent advancements in Large Language Models (LLMs) have significantly enhanced their capacity to process long contexts. However, effectively utilizing this long context remains a challenge due to the issue of distraction, where irrelevant information dominates lengthy contexts, causing LLMs to lose focus on the most relevant segments. To address this, we propose a novel training method that enhances LLMs' ability to discern relevant information through a unique combination of retrieval-based data augmentation and contrastive learning. Specifically, during fine-tuning with long contexts, we employ a retriever to extract the most relevant segments, serving as augmented inputs. We then introduce an auxiliary contrastive learning objective to explicitly ensure that outputs from the original context and the retrieved sub-context are closely aligned. Extensive experiments on long single-document and multi-document QA benchmarks demonstrate the effectiveness of our proposed method. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.05928v1-abstract-full').style.display = 'none'; document.getElementById('2411.05928v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 8 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03707">arXiv:2411.03707</a> <span> [<a href="https://arxiv.org/pdf/2411.03707">pdf</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Fine-Tuning Vision-Language Model for Automated Engineering Drawing Information Extraction </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Khan%2C+M+T">Muhammad Tayyab Khan</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lequn Chen</a>, <a href="/search/cs?searchtype=author&query=Ng%2C+Y+H">Ye Han Ng</a>, <a href="/search/cs?searchtype=author&query=Feng%2C+W">Wenhe Feng</a>, <a href="/search/cs?searchtype=author&query=Tan%2C+N+Y+J">Nicholas Yew Jin Tan</a>, <a href="/search/cs?searchtype=author&query=Moon%2C+S+K">Seung Ki Moon</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03707v1-abstract-short" style="display: inline;"> Geometric Dimensioning and Tolerancing (GD&T) plays a critical role in manufacturing by defining acceptable variations in part features to ensure component quality and functionality. 
However, extracting GD&T information from 2D engineering drawings is a time-consuming and labor-intensive task, often relying on manual efforts or semi-automated tools. To address these challenges, this study proposes… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03707v1-abstract-full').style.display = 'inline'; document.getElementById('2411.03707v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03707v1-abstract-full" style="display: none;"> Geometric Dimensioning and Tolerancing (GD&T) plays a critical role in manufacturing by defining acceptable variations in part features to ensure component quality and functionality. However, extracting GD&T information from 2D engineering drawings is a time-consuming and labor-intensive task, often relying on manual efforts or semi-automated tools. To address these challenges, this study proposes an automated and computationally efficient GD&T extraction method by fine-tuning Florence-2, an open-source vision-language model (VLM). The model is trained on a dataset of 400 drawings with ground truth annotations provided by domain experts. For comparison, two state-of-the-art closed-source VLMs, GPT-4o and Claude-3.5-Sonnet, are evaluated on the same dataset. All models are assessed using precision, recall, F1-score, and hallucination metrics. Due to the computational cost and impracticality of fine-tuning large closed-source VLMs for domain-specific tasks, GPT-4o and Claude-3.5-Sonnet are evaluated in a zero-shot setting. In contrast, Florence-2, a smaller model with 0.23 billion parameters, is optimized through full-parameter fine-tuning across three distinct experiments, each utilizing datasets augmented to different levels. The results show that Florence-2 achieves a 29.95% increase in precision, a 37.75% increase in recall, a 52.40% improvement in F1-score, and a 43.15% reduction in hallucination rate compared to the best-performing closed-source model. These findings highlight the effectiveness of fine-tuning smaller, open-source VLMs like Florence-2, offering a practical and efficient solution for automated GD&T extraction to support downstream manufacturing tasks. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03707v1-abstract-full').style.display = 'none'; document.getElementById('2411.03707v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 6 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
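The evaluation protocol mentioned above (precision, recall, F1-score, and a hallucination metric over extracted GD&T annotations) can be made concrete with a small scoring sketch; the exact metric definitions and the annotation strings below are illustrative assumptions, not the study's implementation or data.

```python
# Small scoring sketch for extracted GD&T annotations: precision, recall,
# F1 and a hallucination rate (predicted items absent from the ground truth).
# The annotation strings are invented examples, not the study's dataset.
def score(predicted, ground_truth):
    pred, gt = set(predicted), set(ground_truth)
    tp = len(pred & gt)
    precision = tp / len(pred) if pred else 0.0
    recall = tp / len(gt) if gt else 0.0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
    hallucination_rate = len(pred - gt) / len(pred) if pred else 0.0
    return precision, recall, f1, hallucination_rate

gt = ["flatness 0.05", "position dia 0.2 A B C", "perpendicularity 0.1 A"]
pred = ["flatness 0.05", "position dia 0.2 A B C", "parallelism 0.3 D"]
print(score(pred, gt))   # ~ (0.67, 0.67, 0.67, 0.33)
```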
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper has been submitted to the 9th International Conference on Innovation in Artificial Intelligence (ICIAI 2025)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.03389">arXiv:2411.03389</a> <span> [<a href="https://arxiv.org/pdf/2411.03389">pdf</a>, <a href="https://arxiv.org/format/2411.03389">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computational Physics">physics.comp-ph</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Neurons for Neutrons: A Transformer Model for Computation Load Estimation on Domain-Decomposed Neutron Transport Problems </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Mote%2C+A">Alexander Mote</a>, <a href="/search/cs?searchtype=author&query=Palmer%2C+T">Todd Palmer</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lizhong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.03389v2-abstract-short" style="display: inline;"> Domain decomposition is a technique used to reduce memory overhead on large neutron transport problems. Currently, the optimal load-balanced processor allocation for these domains is typically determined through small-scale simulations of the problem, which can be time-consuming for researchers and must be repeated anytime a problem input is changed. We propose a Transformer model with a unique 3D… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03389v2-abstract-full').style.display = 'inline'; document.getElementById('2411.03389v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.03389v2-abstract-full" style="display: none;"> Domain decomposition is a technique used to reduce memory overhead on large neutron transport problems. Currently, the optimal load-balanced processor allocation for these domains is typically determined through small-scale simulations of the problem, which can be time-consuming for researchers and must be repeated anytime a problem input is changed. We propose a Transformer model with a unique 3D input embedding, and input representations designed for domain-decomposed neutron transport problems, which can predict the subdomain computation loads generated by small-scale simulations. We demonstrate that such a model trained on domain-decomposed Small Modular Reactor (SMR) simulations achieves 98.2% accuracy while being able to skip the small-scale simulation step entirely. Tests of the model's robustness on variant fuel assemblies, other problem geometries, and changes in simulation parameters are also discussed. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.03389v2-abstract-full').style.display = 'none'; document.getElementById('2411.03389v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 7 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 5 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">28 pages, 14 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02886">arXiv:2411.02886</a> <span> [<a href="https://arxiv.org/pdf/2411.02886">pdf</a>, <a href="https://arxiv.org/format/2411.02886">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> TokenSelect: Efficient Long-Context Inference and Length Extrapolation for LLMs via Dynamic Token-Level KV Cache Selection </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+W">Wei Wu</a>, <a href="/search/cs?searchtype=author&query=Pan%2C+Z">Zhuoshi Pan</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chao Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liyi Chen</a>, <a href="/search/cs?searchtype=author&query=Bai%2C+Y">Yunchu Bai</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+K">Kun Fu</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+Z">Zheng Wang</a>, <a href="/search/cs?searchtype=author&query=Xiong%2C+H">Hui Xiong</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02886v1-abstract-short" style="display: inline;"> With the development of large language models (LLMs), the ability to handle longer contexts has become a key capability for Web applications such as cross-document understanding and LLM-powered search systems. However, this progress faces two major challenges: performance degradation due to sequence lengths out-of-distribution, and excessively long inference times caused by the quadratic computati… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02886v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02886v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02886v1-abstract-full" style="display: none;"> With the development of large language models (LLMs), the ability to handle longer contexts has become a key capability for Web applications such as cross-document understanding and LLM-powered search systems. 
arXiv:2411.02810 [pdf]
Subjects: cs.CE (Computational Engineering, Finance, and Science); cs.IR (Information Retrieval)
Leveraging Vision-Language Models for Manufacturing Feature Recognition in CAD Designs
Authors: Muhammad Tayyab Khan, Lequn Chen, Ye Han Ng, Wenhe Feng, Nicholas Yew Jin Tan, Seung Ki Moon
Abstract: Automatic feature recognition (AFR) is essential for transforming design knowledge into actionable manufacturing information. Traditional AFR methods, which rely on predefined geometric rules and large datasets, are often time-consuming and lack generalizability across various manufacturing features. To address these challenges, this study investigates vision-language models (VLMs) for automating the recognition of a wide range of manufacturing features in CAD designs without the need for extensive training datasets or predefined rules. Instead, prompt engineering techniques, such as multi-view query images, few-shot learning, sequential reasoning, and chain-of-thought, are applied to enable recognition. The approach is evaluated on a newly developed CAD dataset containing designs of varying complexity relevant to machining, additive manufacturing, sheet metal forming, molding, and casting. Five VLMs, including three closed-source models (GPT-4o, Claude-3.5-Sonnet, and Claude-3.0-Opus) and two open-source models (LLaVA and MiniCPM), are evaluated on this dataset with ground truth features labelled by experts. Key metrics include feature quantity accuracy, feature name matching accuracy, hallucination rate, and mean absolute error (MAE). Results show that Claude-3.5-Sonnet achieves the highest feature quantity accuracy (74%) and name-matching accuracy (75%) with the lowest MAE (3.2), while GPT-4o records the lowest hallucination rate (8%). In contrast, open-source models have higher hallucination rates (>30%) and lower accuracies (<40%). This study demonstrates the potential of VLMs to automate feature recognition in CAD designs within diverse manufacturing scenarios.
Submitted 4 November, 2024; originally announced November 2024.
Comments: Paper has been submitted to The ASME Journal of Computing and Information Science in Engineering (JCISE)
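The abstract names its evaluation metrics but does not define them, so the sketch below shows one plausible, simplified reading (per-part dictionaries mapping feature name to count, with hallucinations counted as predicted names absent from the ground truth). The definitions and the function name are assumptions for illustration only, not the paper's protocol:

import numpy as np

def evaluate_feature_recognition(predicted, ground_truth):
    # predicted / ground_truth: dicts mapping feature name -> count for one CAD part.
    names_pred, names_true = set(predicted), set(ground_truth)
    matched = names_pred & names_true
    name_match_accuracy = len(matched) / max(len(names_true), 1)
    hallucination_rate = len(names_pred - names_true) / max(len(names_pred), 1)
    quantity_accuracy = sum(predicted[n] == ground_truth[n] for n in matched) / max(len(names_true), 1)
    count_errors = [abs(predicted[n] - ground_truth[n]) for n in matched]
    mae = float(np.mean(count_errors)) if count_errors else 0.0
    return name_match_accuracy, quantity_accuracy, hallucination_rate, mae

# one part: the model misses the slot, invents a fillet, and miscounts the pockets
print(evaluate_feature_recognition({"hole": 4, "pocket": 2, "fillet": 6},
                                   {"hole": 4, "pocket": 3, "slot": 1}))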
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Paper has been submitted to The ASME Journal of Computing and Information Science in Engineering (JCISE)</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02041">arXiv:2411.02041</a> <span> [<a href="https://arxiv.org/pdf/2411.02041">pdf</a>, <a href="https://arxiv.org/format/2411.02041">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Information Retrieval">cs.IR</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Enhancing ID-based Recommendation with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lei Chen</a>, <a href="/search/cs?searchtype=author&query=Gao%2C+C">Chen Gao</a>, <a href="/search/cs?searchtype=author&query=Du%2C+X">Xiaoyi Du</a>, <a href="/search/cs?searchtype=author&query=Luo%2C+H">Hengliang Luo</a>, <a href="/search/cs?searchtype=author&query=Jin%2C+D">Depeng Jin</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yong Li</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+M">Meng Wang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02041v1-abstract-short" style="display: inline;"> Large Language Models (LLMs) have recently garnered significant attention in various domains, including recommendation systems. Recent research leverages the capabilities of LLMs to improve the performance and user modeling aspects of recommender systems. These studies primarily focus on utilizing LLMs to interpret textual data in recommendation tasks. However, it's worth noting that in ID-based r… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02041v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02041v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02041v1-abstract-full" style="display: none;"> Large Language Models (LLMs) have recently garnered significant attention in various domains, including recommendation systems. Recent research leverages the capabilities of LLMs to improve the performance and user modeling aspects of recommender systems. These studies primarily focus on utilizing LLMs to interpret textual data in recommendation tasks. However, it's worth noting that in ID-based recommendations, textual data is absent, and only ID data is available. The untapped potential of LLMs for ID data within the ID-based recommendation paradigm remains relatively unexplored. To this end, we introduce a pioneering approach called "LLM for ID-based Recommendation" (LLM4IDRec). This innovative approach integrates the capabilities of LLMs while exclusively relying on ID data, thus diverging from the previous reliance on textual data. 
The basic idea of LLM4IDRec is that by employing LLM to augment ID data, if augmented ID data can improve recommendation performance, it demonstrates the ability of LLM to interpret ID data effectively, exploring an innovative way for the integration of LLM in ID-based recommendation. We evaluate the effectiveness of our LLM4IDRec approach using three widely-used datasets. Our results demonstrate a notable improvement in recommendation performance, with our approach consistently outperforming existing methods in ID-based recommendation by solely augmenting input data. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02041v1-abstract-full').style.display = 'none'; document.getElementById('2411.02041v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.02006">arXiv:2411.02006</a> <span> [<a href="https://arxiv.org/pdf/2411.02006">pdf</a>, <a href="https://arxiv.org/format/2411.02006">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> </div> </div> <p class="title is-5 mathjax"> Foundations and Recent Trends in Multimodal Mobile Agents: A Survey </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wu%2C+B">Biao Wu</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yanda Li</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+M">Meng Fang</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zirui Song</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zhiwei Zhang</a>, <a href="/search/cs?searchtype=author&query=Wei%2C+Y">Yunchao Wei</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Ling Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.02006v1-abstract-short" style="display: inline;"> Mobile agents are essential for automating tasks in complex and dynamic mobile environments. As foundation models evolve, the demands for agents that can adapt in real-time and process multimodal data have grown. This survey provides a comprehensive review of mobile agent technologies, focusing on recent advancements that enhance real-time adaptability and multimodal interaction. Recent evaluation… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02006v1-abstract-full').style.display = 'inline'; document.getElementById('2411.02006v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.02006v1-abstract-full" style="display: none;"> Mobile agents are essential for automating tasks in complex and dynamic mobile environments. As foundation models evolve, the demands for agents that can adapt in real-time and process multimodal data have grown. This survey provides a comprehensive review of mobile agent technologies, focusing on recent advancements that enhance real-time adaptability and multimodal interaction. 
Recent evaluation benchmarks have been developed better to capture the static and interactive environments of mobile tasks, offering more accurate assessments of agents' performance. We then categorize these advancements into two main approaches: prompt-based methods, which utilize large language models (LLMs) for instruction-based task execution, and training-based methods, which fine-tune multimodal models for mobile-specific applications. Additionally, we explore complementary technologies that augment agent performance. By discussing key challenges and outlining future research directions, this survey offers valuable insights for advancing mobile agent technologies. A comprehensive resource list is available at https://github.com/aialt/awesome-mobile-agents <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.02006v1-abstract-full').style.display = 'none'; document.getElementById('2411.02006v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">8 pages, 1 figure</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.01988">arXiv:2411.01988</a> <span> [<a href="https://arxiv.org/pdf/2411.01988">pdf</a>, <a href="https://arxiv.org/format/2411.01988">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> QCS:Feature Refining from Quadruplet Cross Similarity for Facial Expression Recognition </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Wang%2C+C">Chengpeng Wang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Li Chen</a>, <a href="/search/cs?searchtype=author&query=Wang%2C+L">Lili Wang</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Z">Zhaofan Li</a>, <a href="/search/cs?searchtype=author&query=Lv%2C+X">Xuebin Lv</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.01988v1-abstract-short" style="display: inline;"> On facial expression datasets with complex and numerous feature types, where the significance and dominance of labeled features are difficult to predict, facial expression recognition(FER) encounters the challenges of inter-class similarity and intra-class variances, making it difficult to mine effective features. 
We aim to solely leverage the feature similarity among facial samples to address thi… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01988v1-abstract-full').style.display = 'inline'; document.getElementById('2411.01988v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.01988v1-abstract-full" style="display: none;"> On facial expression datasets with complex and numerous feature types, where the significance and dominance of labeled features are difficult to predict, facial expression recognition(FER) encounters the challenges of inter-class similarity and intra-class variances, making it difficult to mine effective features. We aim to solely leverage the feature similarity among facial samples to address this. We introduce the Cross Similarity Attention (CSA), an input-output position-sensitive attention mechanism that harnesses feature similarity across different images to compute the corresponding global spatial attention. Based on this, we propose a four-branch circular framework, called Quadruplet Cross Similarity (QCS), to extract discriminative features from the same class and eliminate redundant ones from different classes synchronously to refine cleaner features. The symmetry of the network ensures balanced and stable training and reduces the amount of CSA interaction matrix. Contrastive residual distillation is utilized to transfer the information learned in the cross module back to the base network. The cross-attention module exists during training, and only one base branch is retained during inference. our proposed QCS model outperforms state-of-the-art methods on several popular FER datasets, without requiring additional landmark information or other extra training data. The code is available at https://github.com/birdwcp/QCS. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01988v1-abstract-full').style.display = 'none'; document.getElementById('2411.01988v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
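A minimal sketch of the cross-image attention idea behind CSA: attention weights for one sample are computed from its feature similarity to another sample, so each sample is refined by positions that look alike in the other. This is a generic cross-attention reading, not the paper's position-sensitive CSA module or the four-branch QCS framework; shapes and names are assumptions:

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def cross_similarity_attention(feat_a, feat_b):
    # feat_a, feat_b: (positions, dim) flattened spatial features of two face images.
    d = feat_a.shape[-1]
    sim = feat_a @ feat_b.T / np.sqrt(d)          # (positions_a, positions_b) cross-image similarity
    refined_a = softmax(sim, axis=1) @ feat_b     # image A attends over image B's positions
    refined_b = softmax(sim.T, axis=1) @ feat_a   # image B attends over image A's positions
    return refined_a, refined_b

rng = np.random.default_rng(0)
feat_a, feat_b = rng.normal(size=(49, 256)), rng.normal(size=(49, 256))   # e.g. two 7x7x256 maps
refined_a, refined_b = cross_similarity_attention(feat_a, feat_b)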
arXiv:2411.01267 [pdf, other]
Subjects: cs.LG (Machine Learning); stat.ML (Machine Learning)
ProGen: Revisiting Probabilistic Spatial-Temporal Time Series Forecasting from a Continuous Generative Perspective Using Stochastic Differential Equations
Authors: Mingze Gong, Lei Chen, Jia Li
Abstract: Accurate forecasting of spatiotemporal data remains challenging due to complex spatial dependencies and temporal dynamics. The inherent uncertainty and variability in such data often render deterministic models insufficient, prompting a shift towards probabilistic approaches, where diffusion-based generative models have emerged as effective solutions. In this paper, we present ProGen, a novel framework for probabilistic spatiotemporal time series forecasting that leverages Stochastic Differential Equations (SDEs) and diffusion-based generative modeling techniques in the continuous domain. By integrating a novel denoising score model, graph neural networks, and a tailored SDE, ProGen provides a robust solution that effectively captures spatiotemporal dependencies while managing uncertainty. Our extensive experiments on four benchmark traffic datasets demonstrate that ProGen outperforms state-of-the-art deterministic and probabilistic models. This work contributes a continuous, diffusion-based generative approach to spatiotemporal forecasting, paving the way for future research in probabilistic modeling and stochastic processes.
Submitted 2 November, 2024; originally announced November 2024.
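The abstract does not spell out the SDE it uses, so the following is only a generic illustration of the continuous generative machinery it refers to: an Euler-Maruyama sampler for the reverse of a variance-preserving SDE, with a toy score function standing in for ProGen's trained graph-based denoising score model. The constant noise schedule and all names are assumptions:

import numpy as np

def reverse_sde_sample(score_fn, x_T, n_steps=1000, beta=0.1, seed=0):
    # Forward SDE (variance preserving, constant beta): dx = -0.5*beta*x dt + sqrt(beta) dW.
    # Reverse-time SDE: dx = [-0.5*beta*x - beta*score(x, t)] dt + sqrt(beta) dW-bar,
    # integrated backwards from t=1 to t=0 with the Euler-Maruyama scheme.
    rng = np.random.default_rng(seed)
    x, dt = x_T.copy(), 1.0 / n_steps
    for i in range(n_steps, 0, -1):
        t = i / n_steps
        drift = -0.5 * beta * x - beta * score_fn(x, t)
        x = x - drift * dt + np.sqrt(beta * dt) * rng.normal(size=x.shape)
    return x

# toy score of a standard normal target: grad log N(0, I) = -x
samples = reverse_sde_sample(lambda x, t: -x, x_T=np.random.default_rng(1).normal(size=(16, 8)))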
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.01267v1-abstract-full').style.display = 'none'; document.getElementById('2411.01267v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 2 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00781">arXiv:2411.00781</a> <span> [<a href="https://arxiv.org/pdf/2411.00781">pdf</a>, <a href="https://arxiv.org/format/2411.00781">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Robotics">cs.RO</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Artificial Intelligence">cs.AI</span> <span class="tag is-small is-grey tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> Hazards in Daily Life? Enabling Robots to Proactively Detect and Resolve Anomalies </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zirui Song</a>, <a href="/search/cs?searchtype=author&query=Ouyang%2C+G">Guangxian Ouyang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+M">Meng Fang</a>, <a href="/search/cs?searchtype=author&query=Na%2C+H">Hongbin Na</a>, <a href="/search/cs?searchtype=author&query=Shi%2C+Z">Zijing Shi</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+Z">Zhenhao Chen</a>, <a href="/search/cs?searchtype=author&query=Fu%2C+Y">Yujie Fu</a>, <a href="/search/cs?searchtype=author&query=Zhang%2C+Z">Zeyu Zhang</a>, <a href="/search/cs?searchtype=author&query=Jiang%2C+S">Shiyu Jiang</a>, <a href="/search/cs?searchtype=author&query=Fang%2C+M">Miao Fang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Ling Chen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+X">Xiuying Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00781v1-abstract-short" style="display: inline;"> Existing household robots have made significant progress in performing routine tasks, such as cleaning floors or delivering objects. However, a key limitation of these robots is their inability to recognize potential problems or dangers in home environments. For example, a child may pick up and ingest medication that has fallen on the floor, posing a serious risk. We argue that household robots sh… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00781v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00781v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00781v1-abstract-full" style="display: none;"> Existing household robots have made significant progress in performing routine tasks, such as cleaning floors or delivering objects. However, a key limitation of these robots is their inability to recognize potential problems or dangers in home environments. For example, a child may pick up and ingest medication that has fallen on the floor, posing a serious risk. 
We argue that household robots should proactively detect such hazards or anomalies within the home, and propose the task of anomaly scenario generation. We leverage foundational models instead of relying on manually labeled data to build simulated environments. Specifically, we introduce a multi-agent brainstorming approach, where agents collaborate and generate diverse scenarios covering household hazards, hygiene management, and child safety. These textual task descriptions are then integrated with designed 3D assets to simulate realistic environments. Within these constructed environments, the robotic agent learns the necessary skills to proactively discover and handle the proposed anomalies through task decomposition, and optimal learning approach selection. We demonstrate that our generated environment outperforms others in terms of task description and scene diversity, ultimately enabling robotic agents to better address potential household hazards. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00781v1-abstract-full').style.display = 'none'; document.getElementById('2411.00781v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 16 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">In processing</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00776">arXiv:2411.00776</a> <span> [<a href="https://arxiv.org/pdf/2411.00776">pdf</a>, <a href="https://arxiv.org/format/2411.00776">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> Randomized Autoregressive Visual Generation </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Yu%2C+Q">Qihang Yu</a>, <a href="/search/cs?searchtype=author&query=He%2C+J">Ju He</a>, <a href="/search/cs?searchtype=author&query=Deng%2C+X">Xueqing Deng</a>, <a href="/search/cs?searchtype=author&query=Shen%2C+X">Xiaohui Shen</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Liang-Chieh Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00776v1-abstract-short" style="display: inline;"> This paper presents Randomized AutoRegressive modeling (RAR) for visual generation, which sets a new state-of-the-art performance on the image generation task while maintaining full compatibility with language modeling frameworks. 
The proposed RAR is simple: during a standard autoregressive training process with a next-token prediction objective, the input sequence-typically ordered in raster form… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00776v1-abstract-full').style.display = 'inline'; document.getElementById('2411.00776v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00776v1-abstract-full" style="display: none;"> This paper presents Randomized AutoRegressive modeling (RAR) for visual generation, which sets a new state-of-the-art performance on the image generation task while maintaining full compatibility with language modeling frameworks. The proposed RAR is simple: during a standard autoregressive training process with a next-token prediction objective, the input sequence-typically ordered in raster form-is randomly permuted into different factorization orders with a probability r, where r starts at 1 and linearly decays to 0 over the course of training. This annealing training strategy enables the model to learn to maximize the expected likelihood over all factorization orders and thus effectively improve the model's capability of modeling bidirectional contexts. Importantly, RAR preserves the integrity of the autoregressive modeling framework, ensuring full compatibility with language modeling while significantly improving performance in image generation. On the ImageNet-256 benchmark, RAR achieves an FID score of 1.48, not only surpassing prior state-of-the-art autoregressive image generators but also outperforming leading diffusion-based and masked transformer-based methods. Code and models will be made available at https://github.com/bytedance/1d-tokenizer <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00776v1-abstract-full').style.display = 'none'; document.getElementById('2411.00776v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 1 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. 
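The annealed permutation described in this abstract is simple enough to sketch directly; only the per-step linear schedule and the toy 16x16 token grid below are assumptions:

import numpy as np

def rar_training_order(tokens, step, total_steps, rng):
    # With probability r, annealed linearly from 1 to 0 over training, replace the
    # raster order with a random factorization order; otherwise keep the raster order.
    r = max(0.0, 1.0 - step / total_steps)
    return rng.permutation(tokens) if rng.random() < r else tokens

rng = np.random.default_rng(0)
tokens = np.arange(256)                                                         # a 16x16 grid of image tokens
early = rar_training_order(tokens, step=1_000, total_steps=100_000, rng=rng)    # almost always permuted
late = rar_training_order(tokens, step=99_000, total_steps=100_000, rng=rng)    # almost always raster

The next-token prediction loss is then computed on whichever order was drawn, so the same autoregressive training loop is reused unchanged.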
</p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">simple method improving autoregressive image generator to SOTA performance; Project page at https://yucornetto.github.io/projects/rar.html</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2411.00294">arXiv:2411.00294</a> <span> [<a href="https://arxiv.org/pdf/2411.00294">pdf</a>, <a href="https://arxiv.org/format/2411.00294">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computation and Language">cs.CL</span> </div> </div> <p class="title is-5 mathjax"> LLM-Ref: Enhancing Reference Handling in Technical Writing with Large Language Models </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Fuad%2C+K+A+A">Kazi Ahmed Asif Fuad</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Lizhong Chen</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2411.00294v2-abstract-short" style="display: inline;"> Large Language Models (LLMs) excel in data synthesis but can be inaccurate in domain-specific tasks, which retrieval-augmented generation (RAG) systems address by leveraging user-provided data. However, RAGs require optimization in both retrieval and generation stages, which can affect output quality. In this paper, we present LLM-Ref, a writing assistant tool that aids researchers in writing arti… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00294v2-abstract-full').style.display = 'inline'; document.getElementById('2411.00294v2-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2411.00294v2-abstract-full" style="display: none;"> Large Language Models (LLMs) excel in data synthesis but can be inaccurate in domain-specific tasks, which retrieval-augmented generation (RAG) systems address by leveraging user-provided data. However, RAGs require optimization in both retrieval and generation stages, which can affect output quality. In this paper, we present LLM-Ref, a writing assistant tool that aids researchers in writing articles from multiple source documents with enhanced reference synthesis and handling capabilities. Unlike traditional RAG systems that use chunking and indexing, our tool retrieves and generates content directly from text paragraphs. This method facilitates direct reference extraction from the generated outputs, a feature unique to our tool. Additionally, our tool employs iterative response generation, effectively managing lengthy contexts within the language model's constraints. Compared to baseline RAG-based systems, our approach achieves a $3.25\times$ to $6.26\times$ increase in Ragas score, a comprehensive metric that provides a holistic view of a RAG system's ability to produce accurate, relevant, and contextually appropriate responses. This improvement shows our method enhances the accuracy and contextual relevance of writing assistance tools. 
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2411.00294v2-abstract-full').style.display = 'none'; document.getElementById('2411.00294v2-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 4 November, 2024; <span class="has-text-black-bis has-text-weight-semibold">v1</span> submitted 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> November 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">20 pages, 7 figures</span> </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">ACM Class:</span> I.2.7 </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23992">arXiv:2410.23992</a> <span> [<a href="https://arxiv.org/pdf/2410.23992">pdf</a>, <a href="https://arxiv.org/format/2410.23992">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Machine Learning">cs.LG</span> </div> </div> <p class="title is-5 mathjax"> Ada-MSHyper: Adaptive Multi-Scale Hypergraph Transformer for Time Series Forecasting </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Shang%2C+Z">Zongjiang Shang</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Ling Chen</a>, <a href="/search/cs?searchtype=author&query=wu%2C+B">Binqing wu</a>, <a href="/search/cs?searchtype=author&query=Cui%2C+D">Dongliang Cui</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23992v1-abstract-short" style="display: inline;"> Although transformer-based methods have achieved great success in multi-scale temporal pattern interaction modeling, two key challenges limit their further development: (1) Individual time points contain less semantic information, and leveraging attention to model pair-wise interactions may cause the information utilization bottleneck. (2) Multiple inherent temporal variations (e.g., rising, falli… <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23992v1-abstract-full').style.display = 'inline'; document.getElementById('2410.23992v1-abstract-short').style.display = 'none';">▽ More</a> </span> <span class="abstract-full has-text-grey-dark mathjax" id="2410.23992v1-abstract-full" style="display: none;"> Although transformer-based methods have achieved great success in multi-scale temporal pattern interaction modeling, two key challenges limit their further development: (1) Individual time points contain less semantic information, and leveraging attention to model pair-wise interactions may cause the information utilization bottleneck. (2) Multiple inherent temporal variations (e.g., rising, falling, and fluctuating) entangled in temporal patterns. To this end, we propose Adaptive Multi-Scale Hypergraph Transformer (Ada-MSHyper) for time series forecasting. 
Specifically, an adaptive hypergraph learning module is designed to provide foundations for modeling group-wise interactions, then a multi-scale interaction module is introduced to promote more comprehensive pattern interactions at different scales. In addition, a node and hyperedge constraint mechanism is introduced to cluster nodes with similar semantic information and differentiate the temporal variations within each scales. Extensive experiments on 11 real-world datasets demonstrate that Ada-MSHyper achieves state-of-the-art performance, reducing prediction errors by an average of 4.56%, 10.38%, and 4.97% in MSE for long-range, short-range, and ultra-long-range time series forecasting, respectively. Code is available at https://github.com/shangzongjiang/Ada-MSHyper. <a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23992v1-abstract-full').style.display = 'none'; document.getElementById('2410.23992v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> <p class="comments is-size-7"> <span class="has-text-black-bis has-text-weight-semibold">Comments:</span> <span class="has-text-grey-dark mathjax">Accepted by NeurIPS, 21 pages, and 8 figures</span> </p> </li> <li class="arxiv-result"> <div class="is-marginless"> <p class="list-title is-inline-block"><a href="https://arxiv.org/abs/2410.23907">arXiv:2410.23907</a> <span> [<a href="https://arxiv.org/pdf/2410.23907">pdf</a>, <a href="https://arxiv.org/format/2410.23907">other</a>] </span> </p> <div class="tags is-inline-block"> <span class="tag is-small is-link tooltip is-tooltip-top" data-tooltip="Computer Vision and Pattern Recognition">cs.CV</span> </div> </div> <p class="title is-5 mathjax"> IP-MOT: Instance Prompt Learning for Cross-Domain Multi-Object Tracking </p> <p class="authors"> <span class="search-hit">Authors:</span> <a href="/search/cs?searchtype=author&query=Luo%2C+R">Run Luo</a>, <a href="/search/cs?searchtype=author&query=Song%2C+Z">Zikai Song</a>, <a href="/search/cs?searchtype=author&query=Chen%2C+L">Longze Chen</a>, <a href="/search/cs?searchtype=author&query=Li%2C+Y">Yunshui Li</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+M">Min Yang</a>, <a href="/search/cs?searchtype=author&query=Yang%2C+W">Wei Yang</a> </p> <p class="abstract mathjax"> <span class="has-text-black-bis has-text-weight-semibold">Abstract</span>: <span class="abstract-short has-text-grey-dark mathjax" id="2410.23907v1-abstract-short" style="display: inline;"> Multi-Object Tracking (MOT) aims to associate multiple objects across video frames and is a challenging vision task due to inherent complexities in the tracking environment. Most existing approaches train and track within a single domain, resulting in a lack of cross-domain generalizability to data from other domains. 
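A minimal sketch of group-wise interaction through a hypergraph at a single scale, assuming a soft incidence matrix; in Ada-MSHyper the incidence structure is learned adaptively and constrained, whereas here it is random, and the multi-scale and constraint components are omitted:

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def hypergraph_interaction(node_feats, n_hyperedges, rng):
    # Soft node-to-hyperedge assignments group several time points into one hyperedge,
    # hyperedges aggregate their members, and the aggregate is scattered back to the nodes.
    n = node_feats.shape[0]
    incidence = softmax(rng.normal(size=(n, n_hyperedges)), axis=1)
    edge_feats = incidence.T @ node_feats          # (n_hyperedges, dim) group-level summaries
    return node_feats + incidence @ edge_feats     # residual group-wise update of each node

rng = np.random.default_rng(0)
x = rng.normal(size=(96, 32))                      # 96 time points at one scale, 32-dim features
y = hypergraph_interaction(x, n_hyperedges=8, rng=rng)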
arXiv:2410.23907 [pdf, other]
Subjects: cs.CV (Computer Vision and Pattern Recognition)
IP-MOT: Instance Prompt Learning for Cross-Domain Multi-Object Tracking
Authors: Run Luo, Zikai Song, Longze Chen, Yunshui Li, Min Yang, Wei Yang
Abstract: Multi-Object Tracking (MOT) aims to associate multiple objects across video frames and is a challenging vision task due to inherent complexities in the tracking environment. Most existing approaches train and track within a single domain, resulting in a lack of cross-domain generalizability to data from other domains. While several works have introduced natural language representation to bridge the domain gap in visual tracking, these textual descriptions often provide too high-level a view and fail to distinguish various instances within the same class. In this paper, we address this limitation by developing IP-MOT, an end-to-end transformer model for MOT that operates without concrete textual descriptions. Our approach is underpinned by two key innovations: Firstly, leveraging a pre-trained vision-language model, we obtain instance-level pseudo textual descriptions via prompt-tuning, which are invariant across different tracking scenes; Secondly, we introduce a query-balanced strategy, augmented by knowledge distillation, to further boost the generalization capabilities of our model. Extensive experiments conducted on three widely used MOT benchmarks, including MOT17, MOT20, and DanceTrack, demonstrate that our approach not only achieves competitive performance on same-domain data compared to state-of-the-art models but also significantly improves the performance of query-based trackers by large margins for cross-domain inputs.
Submitted 30 October, 2024; originally announced October 2024.
arXiv:2410.23875 [pdf, other]
Subjects: cs.AI (Artificial Intelligence)
Plan-on-Graph: Self-Correcting Adaptive Planning of Large Language Model on Knowledge Graphs
Authors: Liyi Chen, Panrong Tong, Zhongming Jin, Ying Sun, Jieping Ye, Hui Xiong
Abstract: Large Language Models (LLMs) have shown remarkable reasoning capabilities on complex tasks, but they still suffer from out-of-date knowledge, hallucinations, and opaque decision-making. In contrast, Knowledge Graphs (KGs) can provide explicit and editable knowledge for LLMs to alleviate these issues. The existing paradigm of KG-augmented LLMs manually predefines the breadth of the exploration space and requires flawless navigation in KGs. However, this paradigm cannot adaptively explore reasoning paths in KGs based on the question semantics or self-correct erroneous reasoning paths, resulting in a bottleneck in efficiency and effectiveness. To address these limitations, we propose a novel self-correcting adaptive planning paradigm for KG-augmented LLMs named Plan-on-Graph (PoG), which first decomposes the question into several sub-objectives and then repeats the process of adaptively exploring reasoning paths, updating memory, and reflecting on the need to self-correct erroneous reasoning paths until arriving at the answer. Specifically, three important mechanisms of Guidance, Memory, and Reflection are designed to work together to guarantee the adaptive breadth of self-correcting planning for graph reasoning. Finally, extensive experiments on three real-world datasets demonstrate the effectiveness and efficiency of PoG.
Submitted 31 October, 2024; originally announced October 2024.
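The explore, memory, and reflection loop described in this abstract can be caricatured on a toy knowledge graph; in PoG the guidance and reflection steps are carried out by an LLM, whereas below they are reduced to trying candidate relation paths and backtracking when a hop is missing. Everything here (the graph, the paths, the function) is an illustrative assumption:

# toy KG: (head entity, relation) -> tail entity
KG = {
    ("Alice", "works_at"): "AcmeCorp",
    ("AcmeCorp", "located_in"): "Berlin",
    ("Berlin", "country"): "Germany",
}

def explore(start, relation_path):
    # Follow one candidate relation path, recording visited triples in memory;
    # "reflection" here is simply abandoning the path when a hop does not exist.
    memory, entity = [], start
    for rel in relation_path:
        nxt = KG.get((entity, rel))
        if nxt is None:
            return None, memory
        memory.append((entity, rel, nxt))
        entity = nxt
    return entity, memory

# two candidate paths for "Which country is Alice's employer located in?"
for path in [["works_at", "headquarters_country"], ["works_at", "located_in", "country"]]:
    answer, memory = explore("Alice", path)
    if answer is not None:
        print(answer, memory)   # Germany, with the supporting triples
        break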
<a class="is-size-7" style="white-space: nowrap;" onclick="document.getElementById('2410.23875v1-abstract-full').style.display = 'none'; document.getElementById('2410.23875v1-abstract-short').style.display = 'inline';">△ Less</a> </span> </p> <p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span> 31 October, 2024; <span class="has-text-black-bis has-text-weight-semibold">originally announced</span> October 2024. </p> </li> </ol> <nav class="pagination is-small is-centered breathe-horizontal" role="navigation" aria-label="pagination"> <a href="" class="pagination-previous is-invisible">Previous </a> <a href="/search/?searchtype=author&query=Chen%2C+L&start=50" class="pagination-next" >Next </a> <ul class="pagination-list"> <li> <a href="/search/?searchtype=author&query=Chen%2C+L&start=0" class="pagination-link is-current" aria-label="Goto page 1">1 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+L&start=50" class="pagination-link " aria-label="Page 2" aria-current="page">2 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+L&start=100" class="pagination-link " aria-label="Page 3" aria-current="page">3 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+L&start=150" class="pagination-link " aria-label="Page 4" aria-current="page">4 </a> </li> <li> <a href="/search/?searchtype=author&query=Chen%2C+L&start=200" class="pagination-link " aria-label="Page 5" aria-current="page">5 </a> </li> <li><span class="pagination-ellipsis">…</span></li> </ul> </nav> <div class="is-hidden-tablet"> <!-- feedback for mobile only --> <span class="help" style="display: inline-block;"><a href="https://github.com/arXiv/arxiv-search/releases">Search v0.5.6 released 2020-02-24</a> </span> </div> </div> </main> <footer> <div class="columns is-desktop" role="navigation" aria-label="Secondary"> <!-- MetaColumn 1 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul class="nav-spaced"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- end MetaColumn 1 --> <!-- MetaColumn 2 --> <div class="column"> <div class="columns"> <div class="column"> <ul class="nav-spaced"> 
<li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul class="nav-spaced"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> </div> </footer> <script src="https://static.arxiv.org/static/base/1.0.0a5/js/member_acknowledgement.js"></script> </body> </html>