Skip to content

Commit e082f3c

Browse files
authored
Support pyarrow output for GenomicsDB queries (#51)
* Support pyarrow output for GenomicsDB queries * Modify message while installing devtoolset-11 gcc for cibuildwheel * Add arrow tests to test_genomicsdb_demo for benchmarking * Address review comments * Rename non_blocking to batching mode * Move back to using develop for builds
1 parent 7eaa14c commit e082f3c

13 files changed

+453
-14
lines changed

.github/scripts/install_prereqs.sh

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,15 @@ install_prereqs_for_centos7() {
112112
yum install -y -q which wget git &&
113113
yum install -y -q autoconf automake libtool unzip &&
114114
yum install -y -q cmake3 patch &&
115-
yum install -y -q perl perl-IPC-Cmd
115+
yum install -y -q perl perl-IPC-Cmd &&
116+
echo "Installing devtoolset-11-GCC for semaphore support for cibuildwheel manylinux2014 builds" &&
117+
yum install -y -q devtoolset-11-gcc devtoolset-11-gcc-c++ &&
118+
export CC=/opt/rh/devtoolset-11/root/usr/bin/gcc &&
119+
export CXX=/opt/rh/devtoolset-11/root/usr/bin/g++ &&
120+
ls -l /opt/rh/devtoolset-11/root/usr/bin/gcc &&
121+
ls -l /opt/rh/devtoolset-11/root/usr/bin/g++ &&
122+
echo "Installing devtoolset DONE"
123+
if [[ $? != 0 ]]; then exit 1; fi
116124
if [[ $1 == "release" ]]; then
117125
install_openssl3
118126
install_curl
@@ -203,7 +211,7 @@ if [[ $1 == "release" ]]; then
203211
echo "OSX_ARCH=$OSX_ARCH"
204212
CMAKE_ARCH_ARG="-DCMAKE_OSX_ARCHITECTURES=${OSX_ARCH}"
205213
fi
206-
cmake .. $CMAKE_ARCH_ARG -DPROTOBUF_ROOT_DIR=./protobuf -DGCSSDK_ROOT_DIR=./gcssdk -DAWSSDK_ROOT_DIR=./awssdk -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX -DCMAKE_PREFIX_PATH=$INSTALL_PREFIX -DBUILD_EXAMPLES=False -DDISABLE_MPI=True -DDISABLE_OPENMP=True -DDISABLE_TOOLS=True -DDISABLE_EXAMPLES=True -DDISABLE_TESTING=True -DOPENSSL_USE_STATIC_LIBS=True &&
214+
cmake .. $CMAKE_ARCH_ARG -DBUILD_NANOARROW=1 -DPROTOBUF_ROOT_DIR=./protobuf -DGCSSDK_ROOT_DIR=./gcssdk -DAWSSDK_ROOT_DIR=./awssdk -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX -DCMAKE_PREFIX_PATH=$INSTALL_PREFIX -DBUILD_EXAMPLES=False -DDISABLE_MPI=True -DDISABLE_OPENMP=True -DDISABLE_TOOLS=True -DDISABLE_EXAMPLES=True -DDISABLE_TESTING=True -DOPENSSL_USE_STATIC_LIBS=True &&
207215
make -j4 || rebuild && $SUDO make install &&
208216
popd && popd
209217
fi

.github/workflows/basic.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ env:
2020
jobs:
2121
native-build:
2222
name: GenomicsDB Build
23-
runs-on: ubuntu-20.04
23+
runs-on: ubuntu-22.04
2424
steps:
2525
- name: Cache Native GenomicsDB
2626
uses: actions/cache@v3
@@ -39,7 +39,7 @@ jobs:
3939
sudo apt-get update -q && sudo apt install -y libcurl4-openssl-dev curl
4040
git clone https://github.com/GenomicsDB/GenomicsDB.git -b $GENOMICSDB_BRANCH $GENOMICSDB_BUILD_DIR
4141
cd $GENOMICSDB_BUILD_DIR
42-
cmake -S . -B build -DBUILD_FOR_PYTHON=1 -DCMAKE_INSTALL_PREFIX=$GENOMICSDB_HOME
42+
cmake -S . -B build -DBUILD_FOR_PYTHON=1 -DCMAKE_INSTALL_PREFIX=$GENOMICSDB_HOME -DBUILD_NANOARROW=1
4343
cd build && make -j4 && make install
4444
cd $(dirname $GENOMICSDB_HOME)
4545
tar -cvf ${GENOMICSDB_TARBALL} $(basename $GENOMICSDB_HOME)
@@ -57,7 +57,7 @@ jobs:
5757
name: GenomicsDB Python Build
5858
needs: native-build
5959

60-
runs-on: ubuntu-20.04
60+
runs-on: ubuntu-22.04
6161
strategy:
6262
fail-fast: true
6363
matrix:

.github/workflows/release.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ jobs:
5151
CIBW_ENVIRONMENT_LINUX: >
5252
LD_LIBRARY_PATH=${{ env.INSTALL_PREFIX }}/lib64:${{ env.INSTALL_PREFIX }}/lib:$LD_LIBRARY_PATH
5353
PKG_CONFIG_PATH=${{ env.INSTALL_PREFIX }}/lib64/pkgconfig:${{ env.INSTALL_PREFIX }}/lib/pkgconfig
54+
CC=/opt/rh/devtoolset-11/root/usr/bin/gcc
55+
CXX=/opt/rh/devtoolset-11/root/usr/bin/g++
5456
CIBW_TEST_REQUIRES: pytest
5557

5658
- uses: actions/upload-artifact@v4

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ help:
3030
@python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
3131

3232
clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
33+
rm -f src/genomicsdb.cc
3334
rm -fr genomicsdb/lib/lib*
3435
rm -fr genomicsdb/include/*.h
3536
rm -f genomicsdb/genomicsdb.cpython*so
@@ -96,7 +97,7 @@ dist: ## builds source and wheel package
9697
install: ## install the package to the active Python's site-packages
9798
python setup.py install --with-libs
9899

99-
install-dev: # install the package in place for debug purposes.
100+
install-dev: clean # install the package in place for debug purposes.
100101
# python -m pip install --upgrade pip
101102
# python -m pip install -r requirements_dev.txt
102103
python setup.py build_ext --inplace --with-libs

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
# Description: Minimum Package Dependencies
2626
#
2727

28-
numpy>=1.24.0
28+
numpy>=1.24.0,<2.0.0
2929
pandas>=2.1.0
3030
protobuf>=4.21.1
31+
pyarrow>=14.0.0

setup.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,15 @@
107107
with open(filename, "w") as file:
108108
file.write(replaced_contents)
109109

110+
os.environ["CFLAGS"] = "-DUSE_NANOARROW=1"
111+
110112
if "OSX_ARCH" in os.environ:
111113
os.environ["CFLAGS"] = "-arch " + os.environ["OSX_ARCH"]
112114

113-
115+
EXTRA_COMPILE_ARGS=["-std=c++20"]
116+
if "CXX" in os.environ and os.environ["CXX"].find("devtoolset-11") > 0:
117+
EXTRA_COMPILE_ARGS=["-std=c++2a"]
118+
114119
def run_cythonize(src):
115120
from Cython.Build.Dependencies import cythonize
116121

@@ -126,12 +131,13 @@ def run_cythonize(src):
126131
run_cythonize("src/genomicsdb.pyx"),
127132
"src/genomicsdb_processor.cpp",
128133
"src/genomicsdb_processor_columnar.cpp",
134+
"src/genomicsdb_arrow_utils.cpp",
129135
],
130136
libraries=["tiledbgenomicsdb"],
131137
library_dirs=[GENOMICSDB_LIB_DIR],
132138
runtime_library_dirs=rpath,
133139
extra_link_args=link_args,
134-
extra_compile_args=["-std=c++17"],
140+
extra_compile_args=EXTRA_COMPILE_ARGS,
135141
)
136142

137143
with open("README.md") as f:

src/genomicsdb.pxd

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,14 @@ cdef extern from "genomicsdb.h":
9191
string construct_json_output() except +
9292
pass
9393

94+
cdef cppclass ArrowVariantCallProcessor(GenomicsDBVariantCallProcessor):
95+
ArrowVariantCallProcessor() except +
96+
ArrowVariantCallProcessor(bool) except +
97+
void set_batching(bool)
98+
void* arrow_schema() except + nogil
99+
void* arrow_array() except + nogil
100+
pass
101+
94102
cdef enum query_config_type_t "GenomicsDB::query_config_type_t":
95103
GENOMICSDB_NONE "GenomicsDB::NONE",
96104
GENOMICSDB_JSON_FILE "GenomicsDB::JSON_FILE",
@@ -166,6 +174,78 @@ cdef extern from "genomicsdb_processor.h":
166174
object construct_data_frame() except +
167175
pass
168176

177+
# Apache Arrow C data structures so we do not have to import (nano)arrow_c
178+
179+
cdef struct ArrowSchema:
180+
const char* format
181+
const char* name
182+
const char* metadata
183+
int64_t flags
184+
int64_t n_children
185+
ArrowSchema** children
186+
ArrowSchema* dictionary
187+
void (*release)(ArrowSchema*)
188+
pass
189+
190+
cdef struct ArrowArray:
191+
int64_t length
192+
int64_t null_count
193+
int64_t offset
194+
int64_t n_buffers
195+
int64_t n_children;
196+
const void** buffers
197+
ArrowArray** children
198+
ArrowArray* dictionary
199+
void (*release)(ArrowArray*)
200+
pass
201+
202+
cdef enum ArrowType:
203+
NANOARROW_TYPE_UNINITIALIZED = 0
204+
NANOARROW_TYPE_NA = 1
205+
NANOARROW_TYPE_BOOL
206+
NANOARROW_TYPE_UINT8
207+
NANOARROW_TYPE_INT8
208+
NANOARROW_TYPE_UINT16
209+
NANOARROW_TYPE_INT16
210+
NANOARROW_TYPE_UINT32
211+
NANOARROW_TYPE_INT32
212+
NANOARROW_TYPE_UINT64
213+
NANOARROW_TYPE_INT64
214+
NANOARROW_TYPE_HALF_FLOAT
215+
NANOARROW_TYPE_FLOAT
216+
NANOARROW_TYPE_DOUBLE
217+
NANOARROW_TYPE_STRING
218+
NANOARROW_TYPE_BINARY
219+
NANOARROW_TYPE_FIXED_SIZE_BINARY
220+
NANOARROW_TYPE_DATE32
221+
NANOARROW_TYPE_DATE64
222+
NANOARROW_TYPE_TIMESTAMP
223+
NANOARROW_TYPE_TIME32
224+
NANOARROW_TYPE_TIME64
225+
NANOARROW_TYPE_INTERVAL_MONTHS
226+
NANOARROW_TYPE_INTERVAL_DAY_TIME
227+
NANOARROW_TYPE_DECIMAL128
228+
NANOARROW_TYPE_DECIMAL256
229+
NANOARROW_TYPE_LIST
230+
NANOARROW_TYPE_STRUCT
231+
NANOARROW_TYPE_SPARSE_UNION
232+
NANOARROW_TYPE_DENSE_UNION
233+
NANOARROW_TYPE_DICTIONARY
234+
NANOARROW_TYPE_MAP
235+
NANOARROW_TYPE_EXTENSION
236+
NANOARROW_TYPE_FIXED_SIZE_LIST
237+
NANOARROW_TYPE_DURATION
238+
NANOARROW_TYPE_LARGE_STRING
239+
NANOARROW_TYPE_LARGE_BINARY
240+
NANOARROW_TYPE_LARGE_LIST
241+
NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO
242+
243+
# Apache Arrow wrappers to (nano)arrow functionality via genomicsdb libraries
244+
245+
cdef void genomicsdb_cleanup_arrow_schema(void*)
246+
cdef void genomicsdb_cleanup_arrow_array(void*)
247+
cdef int genomicsdb_allocate_arrow_schema(ArrowSchema**, ArrowSchema*)
248+
169249
# Filesystem and other Utilities
170250
cdef extern from "genomicsdb_utils.h":
171251
cdef string c_version "genomicsdb::version"()

src/genomicsdb.pyx

Lines changed: 80 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,11 @@
3131

3232
include "utils.pxi"
3333

34-
import tempfile
3534
import pandas
3635
import numpy as np
36+
import pyarrow as pa
37+
import tempfile
38+
import threading
3739
from enum import Enum
3840

3941
from genomicsdb.protobuf import genomicsdb_export_config_pb2 as query_pb
@@ -195,11 +197,16 @@ cdef class _GenomicsDB:
195197
row_ranges=None,
196198
query_protobuf: query_pb.QueryConfiguration=None,
197199
flatten_intervals=False,
198-
json_output=None):
200+
json_output=None,
201+
arrow_output=None,
202+
# batching only used with arrow_output
203+
batching=False):
199204
""" Query for variant calls from the GenomicsDB workspace using array, column_ranges and row_ranges for subsetting """
200205

201206
if json_output is not None:
202207
return self.query_variant_calls_json(array, column_ranges, row_ranges, query_protobuf, json_output);
208+
elif arrow_output is not None:
209+
return self.query_variant_calls_arrow(array, column_ranges, row_ranges, query_protobuf, batching);
203210
elif flatten_intervals is True:
204211
return self.query_variant_calls_columnar(array, column_ranges, row_ranges, query_protobuf)
205212
else:
@@ -305,6 +312,77 @@ cdef class _GenomicsDB:
305312
as_ranges(row_ranges))
306313

307314
return pandas.DataFrame(processor.construct_data_frame()).replace(np.nan, '').replace(-99999, '');
315+
316+
def query_variant_calls_arrow(self,
317+
array=None,
318+
column_ranges=None,
319+
row_ranges=None,
320+
query_protobuf: query_pb.QueryConfiguration=None,
321+
batching=False):
322+
""" Query for variant calls from the GenomicsDB workspace using array, column_ranges and row_ranges for subsetting """
323+
324+
cdef ArrowVariantCallProcessor processor
325+
326+
if batching:
327+
processor.set_batching(1)
328+
329+
def query_calls():
330+
if query_protobuf:
331+
if array or column_ranges or row_ranges:
332+
raise GenomicsDBException("Cannot specify query_protobuf and array/column_ranges/row_ranges together")
333+
configstring = as_protobuf_string(query_protobuf.SerializeToString())
334+
with nogil:
335+
self._genomicsdb.query_variant_calls(processor, configstring, GENOMICSDB_PROTOBUF_BINARY_STRING)
336+
elif array is None:
337+
configstring = as_string("")
338+
with nogil:
339+
self._genomicsdb.query_variant_calls(processor, configstring, GENOMICSDB_NONE)
340+
elif column_ranges is None:
341+
configstring = as_string(array)
342+
rows = scan_full()
343+
with nogil:
344+
self._genomicsdb.query_variant_calls(processor, configstring, rows)
345+
elif row_ranges is None:
346+
configstring = as_string(array)
347+
columns = as_ranges(column_ranges)
348+
with nogil:
349+
self._genomicsdb.query_variant_calls(processor, configstring, columns)
350+
else:
351+
configstring = as_string(array)
352+
columns = as_ranges(column_ranges)
353+
rows = as_ranges(row_ranges)
354+
with nogil:
355+
self._genomicsdb.query_variant_calls(processor, configstring,
356+
columns, rows)
357+
358+
if batching:
359+
query_thread = threading.Thread(target=query_calls)
360+
query_thread.start()
361+
else:
362+
query_calls()
363+
364+
cdef void* arrow_schema = processor.arrow_schema()
365+
if arrow_schema:
366+
schema_capsule = pycapsule_get_arrow_schema(arrow_schema)
367+
schema_obj = _ArrowSchemaWrapper._import_from_c_capsule(schema_capsule)
368+
schema = pa.schema(schema_obj.children_schema)
369+
yield schema.serialize().to_pybytes()
370+
else:
371+
raise GenomicsDBException("Failed to retrieve arrow schema for query_variant_calls()")
372+
373+
cdef void* arrow_array = NULL
374+
while True:
375+
arrow_array = processor.arrow_array()
376+
if arrow_array:
377+
array_capsule = pycapsule_get_arrow_array(arrow_array)
378+
array_obj = _ArrowArrayWrapper._import_from_c_capsule(schema_capsule, array_capsule)
379+
arrays = [pa.array(array_obj.child(i)) for i in range(array_obj.n_children)]
380+
yield pa.RecordBatch.from_arrays(arrays, schema=schema).serialize().to_pybytes()
381+
else:
382+
break
383+
384+
if batching:
385+
query_thread.join()
308386

309387
def to_vcf(self,
310388
array=None,

src/genomicsdb_arrow_utils.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#include "genomicsdb.h"
2+
#include "genomicsdb_processor.h"
3+
4+
5+
void genomicsdb_cleanup_arrow_schema(void *schema) {
6+
return ArrowVariantCallProcessor::cleanup_schema(schema);
7+
}
8+
9+
void genomicsdb_cleanup_arrow_array(void *array) {
10+
return ArrowVariantCallProcessor::cleanup_array(array);
11+
}
12+
13+
int genomicsdb_allocate_arrow_schema(ArrowSchema** schema, ArrowSchema *src) {
14+
int rc = ArrowVariantCallProcessor::allocate_schema((void **)schema, src);
15+
if (!rc) {
16+
// To ensure only the capsule destructor doesn't call a random release ptr
17+
// schema[0]->release = NULL;
18+
}
19+
return rc;
20+
}

0 commit comments

Comments
 (0)