49 changes: 49 additions & 0 deletions docker/quickstart-flink/Dockerfile
@@ -0,0 +1,49 @@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Use Flink 1.20.0 as base image
FROM flink:1.20.0-scala_2.12-java17

# Switch to root user for installation and setup
USER root

# Install necessary packages
RUN apt-get update && \
    apt-get install -y tree && \
    rm -rf /var/lib/apt/lists/*

# Copy sql-client script to the container
COPY bin/* /opt/sql-client/

# Set working directory and environment
WORKDIR /opt/sql-client
ENV SQL_CLIENT_HOME=/opt/sql-client

# Copy Fluss connector JARs and SQL files
# Copy JARs to both sql-client lib and Flink lib directories
COPY lib/* /opt/sql-client/lib/
COPY sql/* /opt/sql-client/sql/
COPY lib/* /opt/flink/lib/
COPY opt/* /opt/flink/opt/

# Modify docker-entrypoint.sh to allow Flink to run as root user
# This is needed for the quickstart environment
RUN sed -i 's/exec $(drop_privs_cmd)/exec/g' /docker-entrypoint.sh

# Make sql-client script executable
RUN ["chmod", "+x", "/opt/sql-client/sql-client"]
41 changes: 41 additions & 0 deletions docker/quickstart-flink/README.md
@@ -0,0 +1,41 @@
# Fluss Quickstart Flink Docker

This directory contains the Docker setup for Fluss Quickstart with Flink integration.

## Overview

The Fluss Quickstart Flink Docker image provides a complete environment for running Flink with Fluss, powered by Paimon lake storage.

## Prerequisites

Before building the Docker image, ensure that:

1. Docker is installed and running.
2. You have internet access for retrieving dependencies.
3. You have built Fluss locally: check out the code version you want to use for the image, then run `./mvnw clean package -DskipTests` from the project root. The local build artifacts are used for the Docker image.

## Build Process

The build process consists of two main steps:

### Step 1: Prepare Build Files

First, you need to prepare the required JAR files and dependencies:

```bash
# Make the script executable
chmod +x prepare_build.sh

# Run the preparation script
./prepare_build.sh
```

### Step 2: Build Docker Image

After the preparation is complete, build the Docker image:

```bash
# Build the Docker image
docker build -t fluss/quickstart-flink:1.20-latest .
```
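
Once built, you can start the bundled SQL client directly as a smoke test. This is a minimal sketch: the wrapper script path comes from the Dockerfile, but the init script it runs expects a Fluss cluster to connect to, which a full quickstart normally provides via additional services:

```bash
# Launch an interactive container running the Flink SQL client wrapper
docker run -it --rm fluss/quickstart-flink:1.20-latest /opt/sql-client/sql-client
```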
21 changes: 21 additions & 0 deletions docker/quickstart-flink/bin/sql-client
@@ -0,0 +1,21 @@
#!/bin/bash

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Launch the Flink SQL client, running the bundled init script first (-i)
# before dropping into the interactive session
"${FLINK_HOME}/bin/sql-client.sh" -i "${SQL_CLIENT_HOME}/sql/sql-client.sql"
234 changes: 234 additions & 0 deletions docker/quickstart-flink/prepare_build.sh
@@ -0,0 +1,234 @@
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -e

# Configuration
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Logging functions
log_info() {
    echo "ℹ️ $1"
}

log_success() {
    echo "✅ $1"
}

log_error() {
    echo "❌ $1" >&2
}

# Utility function to copy versioned JAR files (keeps the original file name)
copy_jar() {
    local src_pattern="$1"
    local dest_dir="$2"
    local description="$3"

    log_info "Copying $description..."

    # Find matching files (nullglob makes a non-matching pattern expand to an
    # empty array instead of being kept as a literal string)
    shopt -s nullglob
    local matches=($src_pattern)
    shopt -u nullglob
    local count=${#matches[@]}

    # No files matched
    if (( count == 0 )); then
        log_error "No matching JAR files found: $src_pattern"
        log_error "Please build the Fluss project first: mvn clean package"
        return 1
    fi

    # Multiple files matched
    if (( count > 1 )); then
        log_error "Multiple matching JAR files found:"
        printf "  %s\n" "${matches[@]}"
        return 1
    fi

    # Exactly one file matched → copy it with the original file name
    mkdir -p "$dest_dir"
    cp "${matches[0]}" "$dest_dir/"
    log_success "Copied: $(basename "${matches[0]}")"
}

# Utility function to download and verify JAR
download_jar() {
    local url="$1"
    local dest_file="$2"
    local expected_hash="$3"
    local description="$4"

    log_info "Downloading $description..."

    # Download the file
    if ! wget -O "$dest_file" "$url"; then
        log_error "Failed to download $description from $url"
        return 1
    fi

    # Verify the downloaded file is non-empty
    if [ ! -s "$dest_file" ]; then
        log_error "Downloaded file is empty: $dest_file"
        return 1
    fi

    # Verify checksum if provided (shasum computes SHA-1 by default)
    if [ -n "$expected_hash" ]; then
        local actual_hash
        actual_hash=$(shasum "$dest_file" | awk '{print $1}')
        if [ "$expected_hash" != "$actual_hash" ]; then
            log_error "Checksum mismatch for $description"
            log_error "Expected: $expected_hash"
            log_error "Actual:   $actual_hash"
            return 1
        fi
        log_success "Checksum verified for $description"
    else
        log_success "Downloaded $description"
    fi
}

# Check if required directories exist
check_prerequisites() {
    log_info "Checking prerequisites..."

    local required_dirs=(
        "$PROJECT_ROOT/fluss-flink/fluss-flink-1.20/target"
        "$PROJECT_ROOT/fluss-lake/fluss-lake-paimon/target"
        "$PROJECT_ROOT/fluss-flink/fluss-flink-tiering/target"
    )

    for dir in "${required_dirs[@]}"; do
        if [ ! -d "$dir" ]; then
            log_error "Required directory not found: $dir"
            log_error "Please build the Fluss project first: mvn clean package"
            exit 1
        fi
    done

    log_success "All prerequisites met"
}

# Main execution
main() {
    log_info "Preparing JAR files for Fluss Quickstart Flink Docker..."
    log_info "Project root: $PROJECT_ROOT"

    # Check prerequisites
    check_prerequisites

    # Clean and create directories
    log_info "Setting up directories..."
    rm -rf lib opt
    mkdir -p lib opt

    # Copy Fluss connector JARs
    log_info "Copying Fluss connector JARs..."
    copy_jar "$PROJECT_ROOT/fluss-flink/fluss-flink-1.20/target/fluss-flink-1.20-*.jar" "./lib" "fluss-flink-1.20 connector"
    copy_jar "$PROJECT_ROOT/fluss-lake/fluss-lake-paimon/target/fluss-lake-paimon-*.jar" "./lib" "fluss-lake-paimon connector"

    # Download external dependencies
    log_info "Downloading external dependencies..."

    # Download flink-faker for data generation
    download_jar \
        "https://github.com/knaufk/flink-faker/releases/download/v0.5.3/flink-faker-0.5.3.jar" \
        "./lib/flink-faker-0.5.3.jar" \
        "" \
        "flink-faker-0.5.3"

    # Download the hadoop-apache uber JAR (Hadoop 3) for Hadoop integration
    download_jar \
        "https://repo1.maven.org/maven2/io/trino/hadoop/hadoop-apache/3.3.5-2/hadoop-apache-3.3.5-2.jar" \
        "./lib/hadoop-apache-3.3.5-2.jar" \
        "508255883b984483a45ca48d5af6365d4f013bb8" \
        "hadoop-apache-3.3.5-2.jar"

    # Download paimon-flink connector
    download_jar \
        "https://repo1.maven.org/maven2/org/apache/paimon/paimon-flink-1.20/1.2.0/paimon-flink-1.20-1.2.0.jar" \
        "./lib/paimon-flink-1.20-1.2.0.jar" \
        "b9f8762c6e575f6786f1d156a18d51682ffc975c" \
        "paimon-flink-1.20-1.2.0"

Review thread:

@MehulBatra (Contributor), Oct 1, 2025:
#1727 For our effort to create a quickstart guide for Iceberg, we need to add the Flink Iceberg dependencies to the fluss/quickstart-flink image as well. What do you think @luoyuxia @wuchong?
PS: I feel we can skip the AWS bundle.

# Download iceberg-flink-runtime for Flink 1.20
download_jar \
    "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.20/1.9.1/iceberg-flink-runtime-1.20-1.9.1.jar" \
    "./lib/iceberg-flink-runtime-1.20-1.9.1.jar" \
    "" \
    "iceberg-flink-runtime-1.20-1.9.1"

# Download iceberg-aws-bundle for S3/Glue support
download_jar \
    "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/1.9.1/iceberg-aws-bundle-1.9.1.jar" \
    "./lib/iceberg-aws-bundle-1.9.1.jar" \
    "" \
    "iceberg-aws-bundle-1.9.1"

Member:
+1

Contributor:
Also, while running the tiering service for Iceberg, I found out we need the Avro JAR as well, or we get the following error:

Caused by: java.lang.NoSuchMethodError: 'org.apache.avro.LogicalTypes$TimestampNanos org.apache.avro.LogicalTypes.timestampNanos()'
    at org.apache.iceberg.avro.TypeToSchema.<clinit>(TypeToSchema.java:50)

# Download Avro 1.12.0
download_jar \
    "https://repo1.maven.org/maven2/org/apache/avro/avro/1.12.0/avro-1.12.0.jar" \
    "./lib/avro-1.12.0.jar" \
    "" \
    "avro-1.12.0"

Member:
@MehulBatra should we also add this instruction to the Iceberg integration documentation?
https://fluss.apache.org/docs/next/streaming-lakehouse/integrate-data-lakes/iceberg/

Contributor (Author):
I suggest doing it in #1727. Let's just focus on quickstart-paimon in this PR. The reasons are:

  • Which JARs to add for the Iceberg quickstart depends on what that quickstart looks like. For example, if there is no S3, we don't need iceberg-aws-bundle.
  • There are some class conflicts to resolve; for example, Iceberg 1.9.0 has also dropped support for Hadoop 2, but quickstart-paimon uses the Hadoop 2 JAR.

Contributor (Author):
Yes, that's what I mean by class conflict: we put Hadoop 2 related classes in FLINK_HOME/lib, but Iceberg requires Hadoop 3, and the documentation also says to use Hadoop 3. I already created a repo https://github.com/luoyuxia/fluss-iceberg/tree/main/flink/lib to verify Fluss integrating with Iceberg. You can refer to it for which JARs to add.

Contributor (Author):
I have now appended a commit to use Hadoop 3, and quickstart-paimon still works. After the upgrade to Hadoop 3, Iceberg can also share this Hadoop 3 lib without the error:

Caused by: java.lang.NoSuchMethodError: 'org.apache.avro.LogicalTypes$TimestampNanos org.apache.avro.LogicalTypes.timestampNanos()'
    at org.apache.iceberg.avro.TypeToSchema.<clinit>(TypeToSchema.java:50)

Contributor:
@wuchong I have added the JARs required specifically for the Iceberg quickstart in the self-hosted Flink environment part!

@MehulBatra (Contributor), Oct 13, 2025:
@luoyuxia I was unaware that adding Hadoop 3 resolves the Avro error without needing a separate Avro JAR; I will incorporate that in the quickstart!
"https://repo1.maven.org/maven2/org/apache/paimon/paimon-flink-1.20/1.2.0/paimon-flink-1.20-1.2.0.jar" \
"./lib/paimon-flink-1.20-1.2.0.jar" \
"b9f8762c6e575f6786f1d156a18d51682ffc975c" \
"paimon-flink-1.20-1.2.0"

    # Prepare lake tiering JAR
    log_info "Preparing lake tiering JAR..."
    copy_jar "$PROJECT_ROOT/fluss-flink/fluss-flink-tiering/target/fluss-flink-tiering-*.jar" "./opt" "fluss-flink-tiering"

    # Final verification
    verify_jars

    # Show summary
    show_summary
}

# Verify that all required JAR files are present
verify_jars() {
    log_info "Verifying all required JAR files are present..."

    local missing_jars=()
    local lib_jars=(
        "fluss-flink-1.20-*.jar"
        "fluss-lake-paimon-*.jar"
        "flink-faker-0.5.3.jar"
        "hadoop-apache-3.3.5-2.jar"
        "paimon-flink-1.20-1.2.0.jar"
    )

    local opt_jars=(
        "fluss-flink-tiering-*.jar"
    )

    # Check lib directory
    for jar_pattern in "${lib_jars[@]}"; do
        if ! ls ./lib/$jar_pattern >/dev/null 2>&1; then
            missing_jars+=("lib/$jar_pattern")
        fi
    done

    # Check opt directory
    for jar_pattern in "${opt_jars[@]}"; do
        if ! ls ./opt/$jar_pattern >/dev/null 2>&1; then
            missing_jars+=("opt/$jar_pattern")
        fi
    done

    # Report results
    if [ ${#missing_jars[@]} -eq 0 ]; then
        log_success "All required JAR files are present!"
    else
        log_error "Missing required JAR files:"
        for jar in "${missing_jars[@]}"; do
            log_error "  - $jar"
        done
        exit 1
    fi
}

# Summary function
show_summary() {
    log_success "JAR files preparation completed!"
    echo ""
    log_info "📦 Generated JAR files:"
    echo "Lib directory:"
    ls -la ./lib/ 2>/dev/null || echo "  (empty)"
    echo "Opt directory:"
    ls -la ./opt/ 2>/dev/null || echo "  (empty)"
}

# Run main function
main "$@"