# GitHub Actions workflow: Integration Tests
# Nightly pytest integration suite run against MinIO (S3-compatible storage)
# and an Embucket server container.
name: Integration Tests

# Least-privilege token: the job only checks out code.
permissions:
  contents: read

# Nightly schedule only (03:00 UTC); no push/PR triggers.
on:
  schedule:
    - cron: '0 3 * * *'

# One run per workflow/ref at a time. head_ref is empty for scheduled runs,
# so run_id is the effective fallback group key there.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true
jobs:
  integration-tests:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    name: Run pytest integration suite
    steps:
      - uses: actions/checkout@v6
        with:
          submodules: true

      - name: Set up Java (required for PySpark)
        uses: actions/setup-java@v5
        with:
          distribution: temurin
          java-version: '17'

      # NOTE: the AKIAIOSFODNN7EXAMPLE credentials below are the well-known
      # AWS documentation example pair, used only for the ephemeral CI MinIO.
      - name: Start MinIO (S3-compatible storage)
        run: |
          docker run -d --rm \
            --name minio \
            -p 9000:9000 \
            -p 9001:9001 \
            -e MINIO_ACCESS_KEY=AKIAIOSFODNN7EXAMPLE \
            -e MINIO_SECRET_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY \
            -e MINIO_ROOT_USER=AKIAIOSFODNN7EXAMPLE \
            -e MINIO_ROOT_PASSWORD=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY \
            minio/minio server /data --console-address ":9001"
          # MINIO_ACCESS_KEY/MINIO_SECRET_KEY are deprecated aliases of
          # MINIO_ROOT_USER/MINIO_ROOT_PASSWORD; kept for older-image compat.
          echo "Starting MinIO..."
          # Wait up to 60s for MinIO; fail fast (with container logs) if it
          # never comes up, instead of letting the later `mc` steps fail
          # with a confusing connection error.
          ready=0
          for i in {1..30}; do
            if curl -fsS http://localhost:9000/minio/health/live >/dev/null 2>&1; then
              echo "MinIO is healthy"
              ready=1
              break
            fi
            sleep 2
          done
          if [ "$ready" -ne 1 ]; then
            echo "MinIO did not become healthy in time" >&2
            docker logs minio || true
            exit 1
          fi

      - name: Create MinIO bucket and upload test data
        run: |
          # Install MinIO client
          wget https://dl.min.io/client/mc/release/linux-amd64/mc
          chmod +x mc
          sudo mv mc /usr/local/bin/
          # Configure MinIO client
          mc alias set minio http://localhost:9000 AKIAIOSFODNN7EXAMPLE wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
          # Create bucket
          mc mb minio/embucket

      - name: Start embucket server (Docker)
        run: |
          # --network host: the server reaches MinIO via localhost:9000 and
          # is itself reachable on localhost:3000. Published ports (-p) are
          # ignored under host networking, so none are declared.
          docker run -d --rm \
            --name embucket-server \
            -e OBJECT_STORE_BACKEND=file \
            -e FILE_STORAGE_PATH=data/ \
            -e DATA_FORMAT=json \
            -e SLATEDB_PREFIX=sdb/ \
            -e CORS_ENABLED=true \
            -e CORS_ALLOW_ORIGIN="http://localhost:8080" \
            -e JWT_SECRET=63f4945d921d599f27ae4fdf5bada3f1 \
            -e CATALOG_URL=http://localhost:3000/catalog \
            -e S3_ENDPOINT=http://localhost:9000 \
            -e AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE \
            -e AWS_SECRET_ACCESS_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY \
            -e AWS_REGION=us-east-2 \
            -e S3_BUCKET=embucket \
            -e S3_ALLOW_HTTP=true \
            -v ${{ github.workspace }}/test/integration:/app/datasets \
            --network host \
            embucket/embucket
          echo "Starting Embucket server..."
          # Best-effort wait: the /health probe may not exist on all image
          # versions, so a failed loop is tolerated (|| true) and a fixed
          # sleep provides the fallback readiness window.
          for i in {1..60}; do
            if curl -fsS http://localhost:3000/health >/dev/null 2>&1; then
              echo "Embucket is healthy"
              break
            fi
            sleep 2
          done || true
          # Fallback wait to ensure readiness
          sleep 10

      - name: Set up Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'

      - name: Install Python dependencies
        working-directory: ./test/integration
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements-test.txt

      - name: Set environment variables for integration tests
        run: |
          echo "EMBUCKET_ICEBERG_REST_URI=http://localhost:3000/catalog" >> $GITHUB_ENV
          echo "EMBUCKET_DATABASE=analytics" >> $GITHUB_ENV
          echo "S3_ENDPOINT=http://localhost:9000" >> $GITHUB_ENV
          echo "S3_ACCESS_KEY=AKIAIOSFODNN7EXAMPLE" >> $GITHUB_ENV
          echo "S3_SECRET_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" >> $GITHUB_ENV
          echo "S3_BUCKET=embucket" >> $GITHUB_ENV
          echo "EMBUCKET_SQL_HOST=localhost" >> $GITHUB_ENV
          echo "EMBUCKET_SQL_PORT=3000" >> $GITHUB_ENV
          echo "EMBUCKET_SQL_PROTOCOL=http" >> $GITHUB_ENV
          echo "EMBUCKET_USER=embucket" >> $GITHUB_ENV
          echo "EMBUCKET_PASSWORD=embucket" >> $GITHUB_ENV
          echo "EMBUCKET_SCHEMA=public" >> $GITHUB_ENV
          echo "AWS_REGION=us-east-2" >> $GITHUB_ENV
          # Path as seen from inside the embucket container (volume mount above)
          echo "LOCAL_BASE_PATH=/app/datasets" >> $GITHUB_ENV

      - name: Set up DuckDB
        uses: opt-nc/setup-duckdb-action@v1.1.5

      - name: Download and Prepare Datasets
        working-directory: ./test/integration
        run: |
          # NYC TLC trip-record samples + one ClickBench partition
          curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet
          curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2025-01.parquet
          curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2025-01.parquet
          curl -o hits.parquet https://storage.googleapis.com/glaredb-bench/data/clickbench/partitioned/hits_35.parquet
          # Generate small TPC-H / TPC-DS datasets (scale factor 0.1) via DuckDB
          mkdir -p tpch_data tpcds_data
          duckdb -c "CALL dbgen(sf=0.1); export database 'tpch_data' (format parquet);"
          duckdb -c "CALL dsdgen(sf=0.1); export database 'tpcds_data' (format parquet);"
          tree

      - name: Run integration tests
        working-directory: ./test/integration
        run: pytest -v

      # Runs even when pytest fails, so failures are summarized in the log.
      - name: Display test failures
        if: always()
        working-directory: ./test/integration
        run: python display_test_failures.py