CINXE.COM
Databases
<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"> <head> <title>Databases </title> <meta name="viewport" content="width=device-width, initial-scale=1"> <link rel="apple-touch-icon" sizes="180x180" href="/static/browse/0.3.4/images/icons/apple-touch-icon.png"> <link rel="icon" type="image/png" sizes="32x32" href="/static/browse/0.3.4/images/icons/favicon-32x32.png"> <link rel="icon" type="image/png" sizes="16x16" href="/static/browse/0.3.4/images/icons/favicon-16x16.png"> <link rel="manifest" href="/static/browse/0.3.4/images/icons/site.webmanifest"> <link rel="mask-icon" href="/static/browse/0.3.4/images/icons/safari-pinned-tab.svg" color="#5bbad5"> <meta name="msapplication-TileColor" content="#da532c"> <meta name="theme-color" content="#ffffff"> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/arXiv.css?v=20240822" /> <link rel="stylesheet" type="text/css" media="print" href="/static/browse/0.3.4/css/arXiv-print.css?v=20200611" /> <link rel="stylesheet" type="text/css" media="screen" href="/static/browse/0.3.4/css/browse_search.css" /> <script language="javascript" src="/static/browse/0.3.4/js/accordion.js"></script> <script src="/static/browse/0.3.4/js/mathjaxToggle.min.js" type="text/javascript"></script> <script type="text/javascript" language="javascript">mathjaxToggle();</script> </head> <body class="with-cu-identity"> <div class="flex-wrap-footer"> <header> <a href="#content" class="is-sr-only">Skip to main content</a> <!-- start desktop header --> <div class="columns is-vcentered is-hidden-mobile" id="cu-identity"> <div class="column" id="cu-logo"> <a href="https://www.cornell.edu/"><img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University" /></a> </div><div class="column" id="support-ack"> <span 
id="support-ack-url">We gratefully acknowledge support from the Simons Foundation, <a href="https://info.arxiv.org/about/ourmembers.html">member institutions</a>, and all contributors.</span> <a href="https://info.arxiv.org/about/donate.html" class="btn-header-donate">Donate</a> </div> </div> <div id="header" class="is-hidden-mobile"> <a aria-hidden="true" tabindex="-1" href="/IgnoreMe"></a> <div class="header-breadcrumbs"> <a href="/"><img src="/static/browse/0.3.4/images/arxiv-logo-one-color-white.svg" alt="arxiv logo" style="height:40px;"/></a> <span>></span> <a href="/list/cs.DB/recent">cs.DB</a> </div> <div class="search-block level-right"> <form class="level-item mini-search" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <div class="control"> <input class="input is-small" type="text" name="query" placeholder="Search..." aria-label="Search term or terms" /> <p class="help"><a href="https://info.arxiv.org/help">Help</a> | <a href="https://arxiv.org/search/advanced">Advanced Search</a></p> </div> <div class="control"> <div class="select is-small"> <select name="searchtype" aria-label="Field to search"> <option value="all" selected="selected">All fields</option> <option value="title">Title</option> <option value="author">Author</option> <option value="abstract">Abstract</option> <option value="comments">Comments</option> <option value="journal_ref">Journal reference</option> <option value="acm_class">ACM classification</option> <option value="msc_class">MSC classification</option> <option value="report_num">Report number</option> <option value="paper_id">arXiv identifier</option> <option value="doi">DOI</option> <option value="orcid">ORCID</option> <option value="author_id">arXiv author ID</option> <option value="help">Help pages</option> <option value="full_text">Full text</option> </select> </div> </div> <input type="hidden" name="source" value="header"> <button class="button is-small is-cul-darker">Search</button> </div> 
</form> </div> </div><!-- /end desktop header --> <div class="mobile-header"> <div class="columns is-mobile"> <div class="column logo-arxiv"><a href="https://arxiv.org/"><img src="/static/browse/0.3.4/images/arxiv-logomark-small-white.svg" alt="arXiv logo" style="height:60px;" /></a></div> <div class="column logo-cornell"><a href="https://www.cornell.edu/"> <picture> <source media="(min-width: 501px)" srcset="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg 400w" sizes="400w" /> <source srcset="/static/browse/0.3.4/images/icons/cu/cornell_seal_simple_black.svg 2x" /> <img src="/static/browse/0.3.4/images/icons/cu/cornell-reduced-white-SMALL.svg" alt="Cornell University Logo" /> </picture> </a></div> <div class="column nav" id="toggle-container" role="menubar"> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-white"><title>open search</title><path d="M505 442.7L405.3 343c-4.5-4.5-10.6-7-17-7H372c27.6-35.3 44-79.7 44-128C416 93.1 322.9 0 208 0S0 93.1 0 208s93.1 208 208 208c48.3 0 92.7-16.4 128-44v16.3c0 6.4 2.5 12.5 7 17l99.7 99.7c9.4 9.4 24.6 9.4 33.9 0l28.3-28.3c9.4-9.4 9.4-24.6.1-34zM208 336c-70.7 0-128-57.2-128-128 0-70.7 57.2-128 128-128 70.7 0 128 57.2 128 128 0 70.7-57.2 128-128 128z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <form class="mobile-search-form" method="GET" action="https://arxiv.org/search"> <div class="field has-addons"> <input class="input" type="text" name="query" placeholder="Search..." 
aria-label="Search term or terms" /> <input type="hidden" name="source" value="header"> <input type="hidden" name="searchtype" value="all"> <button class="button">GO</button> </div> </form> </div> <button class="toggle-control"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon filter-white" role="menu"><title>open navigation menu</title><path d="M16 132h416c8.837 0 16-7.163 16-16V76c0-8.837-7.163-16-16-16H16C7.163 60 0 67.163 0 76v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16zm0 160h416c8.837 0 16-7.163 16-16v-40c0-8.837-7.163-16-16-16H16c-8.837 0-16 7.163-16 16v40c0 8.837 7.163 16 16 16z"/></svg></button> <div class="mobile-toggle-block toggle-target"> <nav class="mobile-menu" aria-labelledby="mobilemenulabel"> <h2 id="mobilemenulabel">quick links</h2> <ul> <li><a href="https://arxiv.org/login">Login</a></li> <li><a href="https://info.arxiv.org/help">Help Pages</a></li> <li><a href="https://info.arxiv.org/about">About</a></li> </ul> </nav> </div> </div> </div> </div><!-- /end mobile-header --> </header> <main> <div id="content"> <div id='content-inner'> <div id='dlpage'> <h1>Databases</h1> <ul> <li><a href="#item1">New submissions</a></li> <li><a href="#item5">Replacements</a></li> </ul> <p>See <a id="recent-cs.DB" aria-labelledby="recent-cs.DB" href="/list/cs.DB/recent">recent</a> articles</p> <h3>Showing new listings for Friday, 22 November 2024</h3> <div class='paging'>Total of 8 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href="/list/cs.DB/new?skip=0&amp;show=1000" rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> <dl id='articles'> <h3>New submissions (showing 4 of 4 entries)</h3> <dt> <a name='item1'>[1]</a> <a href ="/abs/2411.13704" title="Abstract" id="2411.13704"> arXiv:2411.13704 </a> [<a href="/pdf/2411.13704" title="Download PDF" 
id="pdf-2411.13704" aria-labelledby="pdf-2411.13704">pdf</a>, <a href="https://arxiv.org/html/2411.13704v1" title="View HTML" id="html-2411.13704" aria-labelledby="html-2411.13704" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.13704" title="Other formats" id="oth-2411.13704" aria-labelledby="oth-2411.13704">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Towards Query Optimizer as a Service (QOaaS) in a Unified LakeHouse Ecosystem: Can One QO Rule Them All? </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Alotaibi,+R">Rana Alotaibi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Tian,+Y">Yuanyuan Tian</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Grafberger,+S">Stefan Grafberger</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Camacho-Rodr%C3%ADguez,+J">Jesús Camacho-Rodríguez</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bruno,+N">Nicolas Bruno</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kroth,+B">Brian Kroth</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Matusevych,+S">Sergiy Matusevych</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Agrawal,+A">Ashvin Agrawal</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Behera,+M">Mahesh Behera</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gosalia,+A">Ashit Gosalia</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Galindo-Legaria,+C">Cesar Galindo-Legaria</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Joshi,+M">Milind Joshi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Potocnik,+M">Milan Potocnik</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sezgin,+B">Beysim Sezgin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Li,+X">Xiaoyu 
Li</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Curino,+C">Carlo Curino</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Databases (cs.DB)</span> </div> <p class='mathjax'> Customer demand, regulatory pressure, and engineering efficiency are the driving forces behind the industry-wide trend of moving from siloed engines and services that are optimized in isolation to highly integrated solutions. This is confirmed by the wide adoption of open formats, shared component libraries, and the meteoric success of integrated data lake experiences such as Microsoft Fabric. <br>In this paper, we study the implications of this trend to Query Optimizer (QO) and discuss our experience of building Calcite and extending Cascades into QO components of Microsoft SQL Server, Fabric Data Warehouse (DW), and SCOPE. We weigh the pros and cons of a drastic change in direction: moving from bespoke QOs or library-sharing (à la Calcite) to rewriting the QO stack and fully embracing Query Optimizer as a Service (QOaaS). We report on some early successes and stumbles as we explore these ideas with prototypes compatible with Fabric DW and Spark. The benefits include centralized workload-level optimizations, multi-engine federation, and accelerated feature creation, but the challenges are equally daunting. We plan to engage CIDR audience in a debate on this exciting topic. 
</p> </div> </dd> <dt> <a name='item2'>[2]</a> <a href ="/abs/2411.14277" title="Abstract" id="2411.14277"> arXiv:2411.14277 </a> [<a href="/pdf/2411.14277" title="Download PDF" id="pdf-2411.14277" aria-labelledby="pdf-2411.14277">pdf</a>, <a href="https://arxiv.org/html/2411.14277v1" title="View HTML" id="html-2411.14277" aria-labelledby="html-2411.14277" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14277" title="Other formats" id="oth-2411.14277" aria-labelledby="oth-2411.14277">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Neuro-Symbolic Query Optimization in Knowledge Graphs </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Acosta,+M">Maribel Acosta</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Qin,+C">Chang Qin</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schwabe,+T">Tim Schwabe</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Databases (cs.DB)</span>; Artificial Intelligence (cs.AI) </div> <p class='mathjax'> This chapter delves into the emerging field of neuro-symbolic query optimization for knowledge graphs (KGs), presenting a comprehensive exploration of how neural and symbolic techniques can be integrated to enhance query processing. Traditional query optimizers in knowledge graphs rely heavily on symbolic methods, utilizing dataset summaries, statistics, and cost models to select efficient execution plans. However, these approaches often suffer from misestimations and inaccuracies, particularly when dealing with complex queries or large-scale datasets. Recent advancements have introduced neural models, which capture non-linear aspects of query optimization, offering promising alternatives to purely symbolic methods. 
In this chapter, we introduce neuro-symbolic query optimizers, a novel approach that combines the strengths of symbolic reasoning with the adaptability of neural computation. We discuss the architecture of these hybrid systems, highlighting the interplay between neural and symbolic components to improve the optimizer's ability to navigate the search space and produce efficient execution plans. Additionally, the chapter reviews existing neural components tailored for optimizing queries over knowledge graphs and examines the limitations and challenges in deploying neuro-symbolic query optimizers in real-world environments. </p> </div> </dd> <dt> <a name='item3'>[3]</a> <a href ="/abs/2411.14330" title="Abstract" id="2411.14330"> arXiv:2411.14330 </a> [<a href="/pdf/2411.14330" title="Download PDF" id="pdf-2411.14330" aria-labelledby="pdf-2411.14330">pdf</a>, <a href="https://arxiv.org/html/2411.14330v1" title="View HTML" id="html-2411.14330" aria-labelledby="html-2411.14330" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14330" title="Other formats" id="oth-2411.14330" aria-labelledby="oth-2411.14330">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Datalog with First-Class Facts </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Gilray,+T">Thomas Gilray</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sahebolamri,+A">Arash Sahebolamri</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Sun,+Y">Yihao Sun</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kunapaneni,+S">Sowmith Kunapaneni</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kumar,+S">Sidharth Kumar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Micinski,+K">Kristopher Micinski</a></div> <div class='list-comments mathjax'><span class='descriptor'>Comments:</span> arXiv admin note: text overlap 
with <a href="https://arxiv.org/abs/2211.11573" data-arxiv-id="2211.11573" class="link-https">arXiv:2211.11573</a> </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Databases (cs.DB)</span>; Programming Languages (cs.PL) </div> <p class='mathjax'> Datalog is a popular logic programming language for deductive reasoning tasks in a wide array of applications, including business analytics, program analysis, and ontological reasoning. However, Datalog's restriction to flat facts over atomic constants leads to challenges in working with tree-structured data, such as derivation trees or abstract syntax trees. To ameliorate Datalog's restrictions, popular extensions of Datalog support features such as existential quantification in rule heads (Datalog$^\pm$, Datalog$^\exists$) or algebraic data types (Soufflé). Unfortunately, these are imperfect solutions for reasoning over structured and recursive data types, with general existentials leading to complex implementations requiring unification, and ADTs unable to trigger rule evaluation and failing to support efficient indexing. <br>We present DL$^{\exists!}$, a Datalog with first-class facts, wherein every fact is identified with a Skolem term unique to the fact. We show that this restriction offers an attractive price point for Datalog-based reasoning over tree-shaped data, demonstrating its application to databases, artificial intelligence, and programming languages. We implemented DL$^{\exists!}$ as a system \slog{}, which leverages the uniqueness restriction of DL$^{\exists!}$ to enable a communication-avoiding, massively-parallel implementation built on MPI. We show that Slog outperforms leading systems (Nemo, Vlog, RDFox, and Soufflé) on a variety of benchmarks, with the potential to scale to thousands of threads. 
</p> </div> </dd> <dt> <a name='item4'>[4]</a> <a href ="/abs/2411.14331" title="Abstract" id="2411.14331"> arXiv:2411.14331 </a> [<a href="/pdf/2411.14331" title="Download PDF" id="pdf-2411.14331" aria-labelledby="pdf-2411.14331">pdf</a>, <a href="https://arxiv.org/html/2411.14331v1" title="View HTML" id="html-2411.14331" aria-labelledby="html-2411.14331" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2411.14331" title="Other formats" id="oth-2411.14331" aria-labelledby="oth-2411.14331">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Data Formats in Analytical DBMSs: Performance Trade-offs and Future Directions </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Liu,+C">Chunwei Liu</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Pavlenko,+A">Anna Pavlenko</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Interlandi,+M">Matteo Interlandi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Haynes,+B">Brandon Haynes</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Databases (cs.DB)</span> </div> <p class='mathjax'> This paper evaluates the suitability of Apache Arrow, Parquet, and ORC as formats for subsumption in an analytical DBMS. We systematically identify and explore the high-level features that are important to support efficient querying in modern OLAP DBMSs and evaluate the ability of each format to support these features. We find that each format has trade-offs that make it more or less suitable for use as a format in a DBMS and identify opportunities to more holistically co-design a unified in-memory and on-disk data representation. Notably, for certain popular machine learning tasks, none of these formats perform optimally, highlighting significant opportunities for advancing format design. 
Our hope is that this study can be used as a guide for system developers designing and using these formats, as well as provide the community with directions to pursue for improving these common open formats. </p> </div> </dd> </dl> <dl id='articles'> <h3>Replacement submissions (showing 4 of 4 entries)</h3> <dt> <a name='item5'>[5]</a> <a href ="/abs/2303.05327" title="Abstract" id="2303.05327"> arXiv:2303.05327 </a> (replaced) [<a href="/pdf/2303.05327" title="Download PDF" id="pdf-2303.05327" aria-labelledby="pdf-2303.05327">pdf</a>, <a href="https://arxiv.org/html/2303.05327v2" title="View HTML" id="html-2303.05327" aria-labelledby="html-2303.05327" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2303.05327" title="Other formats" id="oth-2303.05327" aria-labelledby="oth-2303.05327">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Direct Access for Answers to Conjunctive Queries with Aggregation </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Eldar,+I">Idan Eldar</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Carmeli,+N">Nofar Carmeli</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Kimelfeld,+B">Benny Kimelfeld</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Databases (cs.DB)</span>; Data Structures and Algorithms (cs.DS) </div> <p class='mathjax'> We study the fine-grained complexity of conjunctive queries with grouping and aggregation. For common aggregate functions (e.g., min, max, count, sum), such a query can be phrased as an ordinary conjunctive query over a database annotated with a suitable commutative semiring. We investigate the ability to evaluate such queries by constructing in loglinear time a data structure that provides logarithmic-time direct access to the answers ordered by a given lexicographic order. 
This task is nontrivial since the number of answers might be larger than loglinear in the size of the input, so the data structure needs to provide a compact representation of the space of answers. In the absence of aggregation and annotation, past research established a sufficient tractability condition on queries and orders. For queries without self-joins, this condition is not just sufficient, but also necessary (under conventional lower-bound assumptions in fine-grained complexity). <br>We show that all past results continue to hold for annotated databases, assuming that the annotation itself does not participate in the lexicographic order. Yet, past algorithms do not apply to the count-distinct aggregation, which has no efficient representation as a commutative semiring; for this aggregation, we establish the corresponding tractability condition. We then show how the complexity of the problem changes when we include the aggregate and annotation value in the order. We also study the impact of having all relations but one annotated by the multiplicative identity (one), as happens when we translate aggregate queries into semiring annotations, and having a semiring with an idempotent addition, such as the case of min, max, and count-distinct over a logarithmic-size domain. 
</p> </div> </dd> <dt> <a name='item6'>[6]</a> <a href ="/abs/2312.11122" title="Abstract" id="2312.11122"> arXiv:2312.11122 </a> (replaced) [<a href="/pdf/2312.11122" title="Download PDF" id="pdf-2312.11122" aria-labelledby="pdf-2312.11122">pdf</a>, <a href="https://arxiv.org/html/2312.11122v3" title="View HTML" id="html-2312.11122" aria-labelledby="html-2312.11122" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2312.11122" title="Other formats" id="oth-2312.11122" aria-labelledby="oth-2312.11122">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Evaluation of Dataframe Libraries for Data Preparation on a Single Machine </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Mozzillo,+A">Angelo Mozzillo</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Zecchini,+L">Luca Zecchini</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Gagliardelli,+L">Luca Gagliardelli</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Aslam,+A">Adeel Aslam</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Bergamaschi,+S">Sonia Bergamaschi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Simonini,+G">Giovanni Simonini</a></div> <div class='list-journal-ref'><span class='descriptor'>Journal-ref:</span> Proceedings 28th International Conference on Extending Database Technology, EDBT 2025, Barcelona, Spain, March 25-28, 2025 (pp. 337-349) </div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Databases (cs.DB)</span> </div> <p class='mathjax'> Data preparation is a trial-and-error process that typically involves countless iterations over the data to define the best pipeline of operators for a given task. 
With tabular data, practitioners often perform that burdensome activity on local machines by writing ad hoc scripts with libraries based on the Pandas dataframe API and testing them on samples of the entire dataset-the faster the library, the less idle time its users have. <br>In this paper, we evaluate the most popular Python dataframe libraries in general data preparation use cases to assess how they perform on a single machine. To do so, we employ 4 real-world datasets with heterogeneous features, covering a variety of scenarios, and the TPC-H benchmark. The insights gained with this experimentation are useful to data scientists who need to choose which of the dataframe libraries best suits their data preparation task at hand. <br>In a nutshell, we found that: for small datasets, Pandas consistently proves to be the best choice with the richest API; when data fits in RAM and there is no need for complete compatibility with Pandas API, Polars is the go-to choice thanks to its in-memory execution and query optimizations; when a GPU is available, CuDF often yields the best performance, while for very large datasets that cannot fit in the GPU memory and RAM, PySpark (thanks to a multithread execution and a query optimizer) proves to be the best option. 
</p> </div> </dd> <dt> <a name='item7'>[7]</a> <a href ="/abs/2405.07460" title="Abstract" id="2405.07460"> arXiv:2405.07460 </a> (replaced) [<a href="/pdf/2405.07460" title="Download PDF" id="pdf-2405.07460" aria-labelledby="pdf-2405.07460">pdf</a>, <a href="https://arxiv.org/html/2405.07460v4" title="View HTML" id="html-2405.07460" aria-labelledby="html-2405.07460" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2405.07460" title="Other formats" id="oth-2405.07460" aria-labelledby="oth-2405.07460">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> HoneyBee: A Scalable Modular Framework for Creating Multimodal Oncology Datasets with Foundational Embedding Models </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Tripathi,+A">Aakash Tripathi</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Waqas,+A">Asim Waqas</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Schabath,+M+B">Matthew B. Schabath</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Yilmaz,+Y">Yasin Yilmaz</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Rasool,+G">Ghulam Rasool</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Machine Learning (cs.LG)</span>; Artificial Intelligence (cs.AI); Databases (cs.DB) </div> <p class='mathjax'> Developing accurate machine learning models for oncology requires large-scale, high-quality multimodal datasets. However, creating such datasets remains challenging due to the complexity and heterogeneity of medical data. To address this challenge, we introduce HoneyBee, a scalable modular framework for building multimodal oncology datasets that leverages foundation models to generate representative embeddings. 
HoneyBee integrates various data modalities, including clinical diagnostic and pathology imaging data, medical notes, reports, records, and molecular data. It employs data preprocessing techniques and foundation models to generate embeddings that capture the essential features and relationships within the raw medical data. The generated embeddings are stored in a structured format using Hugging Face datasets and PyTorch dataloaders for accessibility. Vector databases enable efficient querying and retrieval for machine learning applications. We demonstrate the effectiveness of HoneyBee through experiments assessing the quality and representativeness of these embeddings. The framework is designed to be extensible to other medical domains and aims to accelerate oncology research by providing high-quality, machine learning-ready datasets. HoneyBee is an ongoing open-source effort, and the code, datasets, and models are available at the project repository. </p> </div> </dd> <dt> <a name='item8'>[8]</a> <a href ="/abs/2408.04197" title="Abstract" id="2408.04197"> arXiv:2408.04197 </a> (replaced) [<a href="/pdf/2408.04197" title="Download PDF" id="pdf-2408.04197" aria-labelledby="pdf-2408.04197">pdf</a>, <a href="https://arxiv.org/html/2408.04197v2" title="View HTML" id="html-2408.04197" aria-labelledby="html-2408.04197" rel="noopener noreferrer" target="_blank">html</a>, <a href="/format/2408.04197" title="Other formats" id="oth-2408.04197" aria-labelledby="oth-2408.04197">other</a>] </dt> <dd> <div class='meta'> <div class='list-title mathjax'><span class='descriptor'>Title:</span> Pairwise Judgment Formulation for Semantic Embedding Model in Web Search </div> <div class='list-authors'><a href="https://arxiv.org/search/cs?searchtype=author&query=Hong,+M">Mengze Hong</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Ng,+W">Wailing Ng</a>, <a href="https://arxiv.org/search/cs?searchtype=author&query=Guo,+Z">Zichang Guo</a>, <a 
href="https://arxiv.org/search/cs?searchtype=author&query=Zhang,+C+J">Chen Jason Zhang</a></div> <div class='list-subjects'><span class='descriptor'>Subjects:</span> <span class="primary-subject">Information Retrieval (cs.IR)</span>; Artificial Intelligence (cs.AI); Databases (cs.DB) </div> <p class='mathjax'> Semantic Embedding Model (SEM), a neural network-based Siamese architecture, is gaining momentum in information retrieval and natural language processing. In order to train SEM in a supervised fashion for Web search, the search engine query log is typically utilized to automatically formulate pairwise judgments as training data. Despite the growing application of semantic embeddings in the search engine industry, little work has been done on formulating effective pairwise judgments for training SEM. In this paper, we make the first in-depth investigation of a wide range of strategies for generating pairwise judgments for SEM. An interesting (perhaps surprising) discovery reveals that the conventional pairwise judgment formulation strategy widely used in the field of pairwise Learning-to-Rank (LTR) is not necessarily effective for training SEM. Through a large-scale empirical study based on query logs and click-through activities from a major commercial search engine, we demonstrate the effective strategies for SEM and highlight the advantages of a hybrid heuristic (i.e., Clicked > Non-Clicked) in comparison to the atomic heuristics (e.g., Clicked > Skipped) in LTR. We conclude with best practices for training SEM and offer promising insights for future research. 
</p> </div> </dd> </dl> <div class='paging'>Total of 8 entries </div> <div class='morefewer'>Showing up to 2000 entries per page: <a href="/list/cs.DB/new?skip=0&amp;show=1000" rel="nofollow"> fewer</a> | <span style="color: #454545">more</span> | <span style="color: #454545">all</span> </div> </div> </div> </div> </main> <footer style="clear: both;"> <div class="columns is-desktop" role="navigation" aria-label="Secondary" style="margin: -0.75em -0.75em 0.75em -0.75em"> <!-- Macro-Column 1 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/about">About</a></li> <li><a href="https://info.arxiv.org/help">Help</a></li> </ul> </div> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>contact arXiv</title><desc>Click here to contact arXiv</desc><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg> <a href="https://info.arxiv.org/help/contact.html"> Contact</a> </li> <li> <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><title>subscribe to arXiv mailings</title><desc>Click here to subscribe</desc><path d="M476 3.2L12.5 270.6c-18.1 10.4-15.8 35.6 2.2 43.2L121 358.4l287.3-253.2c5.5-4.9 13.3 2.6 8.6 8.3L176 407v80.5c0 23.6 28.5 32.9 42.5 15.8L282 426l124.6 52.2c14.2 6 30.4-2.9 33-18.2l72-432C515 7.8 493.3-6.8 476 3.2z"/></svg> <a 
href="https://info.arxiv.org/help/subscribe"> Subscribe</a> </li> </ul> </div> </div> </div> <!-- End Macro-Column 1 --> <!-- Macro-Column 2 --> <div class="column" style="padding: 0;"> <div class="columns"> <div class="column"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/license/index.html">Copyright</a></li> <li><a href="https://info.arxiv.org/help/policies/privacy_policy.html">Privacy Policy</a></li> </ul> </div> <div class="column sorry-app-links"> <ul style="list-style: none; line-height: 2;"> <li><a href="https://info.arxiv.org/help/web_accessibility.html">Web Accessibility Assistance</a></li> <li> <p class="help"> <a class="a11y-main-link" href="https://status.arxiv.org" target="_blank">arXiv Operational Status <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 512" class="icon filter-dark_grey" role="presentation"><path d="M224.3 273l-136 136c-9.4 9.4-24.6 9.4-33.9 0l-22.6-22.6c-9.4-9.4-9.4-24.6 0-33.9l96.4-96.4-96.4-96.4c-9.4-9.4-9.4-24.6 0-33.9L54.3 103c9.4-9.4 24.6-9.4 33.9 0l136 136c9.5 9.4 9.5 24.6.1 34z"/></svg></a><br> Get status notifications via <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/email/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" class="icon filter-black" role="presentation"><path d="M502.3 190.8c3.9-3.1 9.7-.2 9.7 4.7V400c0 26.5-21.5 48-48 48H48c-26.5 0-48-21.5-48-48V195.6c0-5 5.7-7.8 9.7-4.7 22.4 17.4 52.1 39.5 154.1 113.6 21.1 15.4 56.7 47.8 92.2 47.6 35.7.3 72-32.8 92.3-47.6 102-74.1 131.6-96.3 154-113.7zM256 320c23.2.4 56.6-29.2 73.4-41.4 132.7-96.3 142.8-104.7 173.4-128.7 5.8-4.5 9.2-11.5 9.2-18.9v-19c0-26.5-21.5-48-48-48H48C21.5 64 0 85.5 0 112v19c0 7.4 3.4 14.3 9.2 18.9 30.6 23.9 40.7 32.4 173.4 128.7 16.8 12.2 50.2 41.8 73.4 41.4z"/></svg>email</a> or <a class="is-link" href="https://subscribe.sorryapp.com/24846f03/slack/new" target="_blank"><svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 448 512" class="icon 
filter-black" role="presentation"><path d="M94.12 315.1c0 25.9-21.16 47.06-47.06 47.06S0 341 0 315.1c0-25.9 21.16-47.06 47.06-47.06h47.06v47.06zm23.72 0c0-25.9 21.16-47.06 47.06-47.06s47.06 21.16 47.06 47.06v117.84c0 25.9-21.16 47.06-47.06 47.06s-47.06-21.16-47.06-47.06V315.1zm47.06-188.98c-25.9 0-47.06-21.16-47.06-47.06S139 32 164.9 32s47.06 21.16 47.06 47.06v47.06H164.9zm0 23.72c25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06H47.06C21.16 243.96 0 222.8 0 196.9s21.16-47.06 47.06-47.06H164.9zm188.98 47.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06s-21.16 47.06-47.06 47.06h-47.06V196.9zm-23.72 0c0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06V79.06c0-25.9 21.16-47.06 47.06-47.06 25.9 0 47.06 21.16 47.06 47.06V196.9zM283.1 385.88c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06-25.9 0-47.06-21.16-47.06-47.06v-47.06h47.06zm0-23.72c-25.9 0-47.06-21.16-47.06-47.06 0-25.9 21.16-47.06 47.06-47.06h117.84c25.9 0 47.06 21.16 47.06 47.06 0 25.9-21.16 47.06-47.06 47.06H283.1z"/></svg>slack</a> </p> </li> </ul> </div> </div> </div> <!-- end MetaColumn 2 --> <!-- End Macro-Column 2 --> </div> </footer> </div> <script src="/static/base/1.0.1/js/member_acknowledgement.js"></script> </body> </html>