29 changes: 29 additions & 0 deletions cmd/committer.go
@@ -0,0 +1,29 @@
package cmd

import (
"fmt"

"github.com/rs/zerolog/log"
"github.com/spf13/cobra"
"github.com/thirdweb-dev/indexer/internal/committer"
"github.com/thirdweb-dev/indexer/internal/rpc"
)

var committerCmd = &cobra.Command{
Use: "committer",
Short: "run committer",
Long: "Publishes data from S3 to Kafka. If a block is not found in S3, it will panic.",
Run: RunCommitter,
}

func RunCommitter(cmd *cobra.Command, args []string) {
fmt.Println("running committer")
rpc, err := rpc.Initialize()
if err != nil {
log.Fatal().Err(err).Msg("Failed to initialize RPC")
}
chainId := rpc.GetChainID()

committer.Init(chainId, rpc)
committer.Commit(chainId)
}
Comment on lines +19 to +29

⚠️ Potential issue

Add proper error handling and resource cleanup.

The function should properly handle errors and clean up resources when the committer finishes or encounters an error.

 func RunCommitter(cmd *cobra.Command, args []string) {
 	fmt.Println("running committer")
 	rpc, err := rpc.Initialize()
 	if err != nil {
 		log.Fatal().Err(err).Msg("Failed to initialize RPC")
 	}
+	defer rpc.Close()
+	
 	chainId := rpc.GetChainID()
 
 	committer.Init(chainId, rpc)
-	committer.Commit(chainId)
+	if err := committer.Commit(chainId); err != nil {
+		committer.Close()
+		log.Fatal().Err(err).Msg("Committer failed")
+	}
+	if err := committer.Close(); err != nil {
+		log.Error().Err(err).Msg("Failed to close committer cleanly")
+	}
 }

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In cmd/committer.go around lines 19 to 29, the function lacks proper error
handling and resource cleanup: ensure rpc is closed on exit and handle errors
from Init and Commit. After rpc.Initialize(), if err != nil keep the current
fatal log; otherwise defer a safe rpc.Close() (or Close context) to release
resources. Check and handle errors returned by committer.Init and
committer.Commit (log and exit non‑zero or return the error) instead of ignoring
them, and ensure cleanup runs on both success and error paths.

1 change: 1 addition & 0 deletions cmd/root.go
@@ -429,6 +429,7 @@ func init() {

rootCmd.AddCommand(orchestratorCmd)
rootCmd.AddCommand(apiCmd)
rootCmd.AddCommand(committerCmd)
rootCmd.AddCommand(validateAndFixCmd)
rootCmd.AddCommand(validateCmd)
rootCmd.AddCommand(migrateValidationCmd)
30 changes: 29 additions & 1 deletion configs/config.go
@@ -7,6 +7,8 @@ import (
"strings"
"time"

"github.com/caarlos0/env"
"github.com/joho/godotenv"
"github.com/rs/zerolog/log"
"github.com/spf13/viper"
)
@@ -270,6 +272,37 @@ type Config struct {
Publisher PublisherConfig `mapstructure:"publisher"`
Validation ValidationConfig `mapstructure:"validation"`
Migrator MigratorConfig `mapstructure:"migrator"`

CommitterClickhouseDatabase string `env:"COMMITTER_CLICKHOUSE_DATABASE"`
CommitterClickhouseHost string `env:"COMMITTER_CLICKHOUSE_HOST"`
CommitterClickhousePort int `env:"COMMITTER_CLICKHOUSE_PORT"`
CommitterClickhouseUsername string `env:"COMMITTER_CLICKHOUSE_USERNAME"`
CommitterClickhousePassword string `env:"COMMITTER_CLICKHOUSE_PASSWORD"`
CommitterClickhouseEnableTLS bool `env:"COMMITTER_CLICKHOUSE_ENABLE_TLS" envDefault:"true"`
CommitterKafkaBrokers string `env:"COMMITTER_KAFKA_BROKERS"`
CommitterKafkaUsername string `env:"COMMITTER_KAFKA_USERNAME"`
CommitterKafkaPassword string `env:"COMMITTER_KAFKA_PASSWORD"`
CommitterKafkaEnableTLS bool `env:"COMMITTER_KAFKA_ENABLE_TLS" envDefault:"true"`
StagingS3Bucket string `env:"STAGING_S3_BUCKET" envDefault:"thirdweb-insight-production"`
StagingS3Region string `env:"STAGING_S3_REGION" envDefault:"us-west-2"`
StagingS3AccessKeyID string `env:"STAGING_S3_ACCESS_KEY_ID"`
StagingS3SecretAccessKey string `env:"STAGING_S3_SECRET_ACCESS_KEY"`
StagingS3MaxParallelFileDownload int `env:"STAGING_S3_MAX_PARALLEL_FILE_DOWNLOAD" envDefault:"2"`
CommitterRPCNumParallelCalls int64 `env:"COMMITTER_RPC_NUM_PARALLEL_CALLS" envDefault:"10"`
}

var Cfg Config

func LoadConfig(cfgFile string) error {
err := godotenv.Load()
if err != nil {
log.Info().Msg("No .env file found")
}
err = env.Parse(&Cfg)
if err != nil {
panic(err)
}

Comment on lines +298 to +306

🛠️ Refactor suggestion

Set precedence to ENV > file and avoid panic; parse env last and return errors.

Current flow parses env first, then unmarshals file (file overrides env) and panics on parse errors. Common precedence is ENV > file, and this function already returns errors.

@@
-	err := godotenv.Load()
-	if err != nil {
-		log.Info().Msg("No .env file found")
-	}
-	err = env.Parse(&Cfg)
-	if err != nil {
-		panic(err)
-	}
+	err := godotenv.Load()
+	if err != nil {
+		// .env is optional; keep noise low but retain diagnostics
+		log.Debug().Err(err).Msg("Skipping .env load")
+	}
@@
-	err = viper.Unmarshal(&Cfg)
+	err = viper.Unmarshal(&Cfg)
 	if err != nil {
 		return fmt.Errorf("error unmarshalling config: %v", err)
 	}
+
+	// Parse env-tagged fields last so ENV overrides file values.
+	if err := env.Parse(&Cfg); err != nil {
+		return fmt.Errorf("error parsing env into config: %w", err)
+	}

Also applies to: 332-336

🤖 Prompt for AI Agents
In configs/config.go around lines 297-305 (and similarly 332-336), the current
sequence calls godotenv.Load(), then env.Parse(&Cfg) so file values override
environment and the code panics on parse errors; change the flow so environment
variables take precedence by parsing env last and do not panic—load the file
first into a temporary struct or map, then call env.Parse(&Cfg) to overwrite
with ENV values, propagate and return any parse error instead of panicking, and
adjust both affected blocks accordingly.

if cfgFile != "" {
viper.SetConfigFile(cfgFile)
if err := viper.ReadInConfig(); err != nil {
@@ -301,7 +329,7 @@ func LoadConfig(cfgFile string) error {

viper.AutomaticEnv()

err := viper.Unmarshal(&Cfg)
err = viper.Unmarshal(&Cfg)
if err != nil {
return fmt.Errorf("error unmarshalling config: %v", err)
}
2 changes: 2 additions & 0 deletions go.mod
@@ -52,6 +52,7 @@ require (
github.com/bits-and-blooms/bitset v1.20.0 // indirect
github.com/bytedance/sonic v1.12.6 // indirect
github.com/bytedance/sonic/loader v0.2.1 // indirect
github.com/caarlos0/env v3.5.0+incompatible // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cloudwego/base64x v0.1.4 // indirect
github.com/cloudwego/iasm v0.2.0 // indirect
@@ -97,6 +98,7 @@ require (
github.com/gorilla/websocket v1.4.2 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/joho/godotenv v1.5.1 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.18.0 // indirect
4 changes: 4 additions & 0 deletions go.sum
@@ -61,6 +61,8 @@ github.com/bytedance/sonic v1.12.6/go.mod h1:B8Gt/XvtZ3Fqj+iSKMypzymZxw/FVwgIGKz
github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
github.com/bytedance/sonic/loader v0.2.1 h1:1GgorWTqf12TA8mma4DDSbaQigE2wOgQo7iCjjJv3+E=
github.com/bytedance/sonic/loader v0.2.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
github.com/caarlos0/env v3.5.0+incompatible h1:Yy0UN8o9Wtr/jGHZDpCBLpNrzcFLLM2yixi/rBrKyJs=
github.com/caarlos0/env v3.5.0+incompatible/go.mod h1:tdCsowwCzMLdkqRYDlHpZCp2UooDD3MspDBjZ2AD02Y=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y=
@@ -212,6 +214,8 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/jackpal/go-nat-pmp v1.0.2 h1:KzKSgb7qkJvOUTqYl9/Hg/me3pWgBmERKrTGD7BdWus=
github.com/jackpal/go-nat-pmp v1.0.2/go.mod h1:QPH045xvCAeXUZOxsnwmrtiCoxIr9eob+4orBN1SBKc=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
180 changes: 180 additions & 0 deletions internal/committer/README.md
@@ -0,0 +1,180 @@
# Committer Package

This package implements a committer that processes block data from S3 parquet files and publishes them to Kafka. It follows the requirements specified in the original comments.

## Features

- **ClickHouse Integration**: Gets the maximum block number from ClickHouse for the chain
- **S3 File Discovery**: Lists parquet files from S3 with chain-specific prefixes
- **Block Range Parsing**: Extracts start and end block numbers from S3 filenames
- **File Filtering**: Skips files where end block is less than max block number from ClickHouse
- **Sequential Processing**: Processes files in ascending order by start block number
- **Memory-Efficient Streaming**: Streams parquet files row-by-row to minimize memory usage
- **Kafka Publishing**: Publishes processed block data to Kafka
- **Error Handling**: Comprehensive error handling with detailed logging

## Usage

### Basic Usage

```go
package main

import (
"context"
"math/big"
"log"

"github.com/thirdweb-dev/indexer/internal/committer"
"github.com/thirdweb-dev/indexer/configs"
)

func main() {
// Load configuration
if err := configs.LoadConfig("config.yml"); err != nil {
log.Fatal("Failed to load config:", err)
}

// Create committer for chain ID 1 (Ethereum mainnet)
chainId := big.NewInt(1)
committer, err := committer.NewCommitterFromConfig(chainId)
if err != nil {
log.Fatal("Failed to create committer:", err)
}
defer committer.Close()

// Process blocks
ctx := context.Background()
if err := committer.ProcessBlocks(ctx); err != nil {
log.Fatal("Failed to process blocks:", err)
}
}
```
Comment on lines +20 to +52

⚠️ Potential issue

Examples reference non-existent APIs; align with actual public surface.

README shows NewCommitterFromConfig/NewCommitter/ProcessBlocks, but the code exposes Init/Commit (and a CLI subcommand). Please update the examples to either:

  • demonstrate the CLI: insight committer with env-config, or
  • show programmatic usage via committer.Init(chainID, rpc) then committer.Commit(chainID) (and Close()), matching cmd/committer.go.

I can draft a corrected snippet once you confirm the intended public API.
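Pending that confirmation, a minimal sketch of the programmatic path the CLI takes today, mirroring cmd/committer.go in this PR (the variable is renamed so it does not shadow the rpc package; the Close calls suggested above are omitted because they may not exist yet):

```go
package main

import (
	"github.com/rs/zerolog/log"

	"github.com/thirdweb-dev/indexer/internal/committer"
	"github.com/thirdweb-dev/indexer/internal/rpc"
)

func main() {
	// Mirrors RunCommitter in cmd/committer.go.
	rpcClient, err := rpc.Initialize()
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to initialize RPC")
	}
	chainId := rpcClient.GetChainID()

	committer.Init(chainId, rpcClient)
	committer.Commit(chainId)
}
```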

Also applies to: 56-104


### Advanced Usage with Custom Configuration

```go
package main

import (
"context"
"math/big"
"log"

"github.com/thirdweb-dev/indexer/internal/committer"
"github.com/thirdweb-dev/indexer/configs"
)

func main() {
// Custom configuration
chainId := big.NewInt(137) // Polygon

clickhouseConfig := &configs.ClickhouseConfig{
Host: "localhost",
Port: 9000,
Username: "default",
Password: "",
Database: "insight",
}

s3Config := &configs.S3Config{
Bucket: "thirdweb-insight-production",
Region: "us-east-1",
AccessKeyID: "your-access-key",
SecretAccessKey: "your-secret-key",
}

kafkaConfig := &configs.KafkaConfig{
Brokers: "localhost:9092",
}

// Create committer
committer, err := committer.NewCommitter(chainId, clickhouseConfig, s3Config, kafkaConfig)
if err != nil {
log.Fatal("Failed to create committer:", err)
}
defer committer.Close()

// Process blocks
ctx := context.Background()
if err := committer.ProcessBlocks(ctx); err != nil {
log.Fatal("Failed to process blocks:", err)
}
}
```

## Configuration Requirements

The committer requires the following configuration:

### ClickHouse Configuration
- Host, Port, Username, Password, Database
- Used to query the maximum block number for the chain

### S3 Configuration
- Bucket name (e.g., "thirdweb-insight-production")
- Region, Access Key ID, Secret Access Key
- Used to list and download parquet files

### Kafka Configuration
- Brokers list
- Used to publish processed block data

## S3 File Structure

The committer expects S3 files to follow this naming pattern:
```
chain_${chainId}/year=2024/blocks_1000_2000.parquet
```

Where:
- `chain_${chainId}` is the prefix for the chain
- `year=2024` is the partitioning by year
- `blocks_1000_2000.parquet` contains blocks from 1000 to 2000
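As an illustration, a minimal sketch of extracting the block range from such a key; the helper name and error handling are assumptions, not the package's actual parser:

```go
package main

import (
	"fmt"
	"path"
	"strconv"
	"strings"
)

// parseBlockRange extracts the start and end block numbers from a key like
// "chain_1/year=2024/blocks_1000_2000.parquet". Hypothetical helper for
// illustration; the package's real parsing logic may differ.
func parseBlockRange(key string) (start, end uint64, err error) {
	name := strings.TrimSuffix(path.Base(key), ".parquet") // "blocks_1000_2000"
	parts := strings.Split(name, "_")                      // ["blocks", "1000", "2000"]
	if len(parts) != 3 || parts[0] != "blocks" {
		return 0, 0, fmt.Errorf("unexpected key format: %q", key)
	}
	if start, err = strconv.ParseUint(parts[1], 10, 64); err != nil {
		return 0, 0, err
	}
	if end, err = strconv.ParseUint(parts[2], 10, 64); err != nil {
		return 0, 0, err
	}
	return start, end, nil
}

func main() {
	start, end, err := parseBlockRange("chain_1/year=2024/blocks_1000_2000.parquet")
	fmt.Println(start, end, err) // 1000 2000 <nil>
}
```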

## Parquet File Structure

The parquet files should contain the following columns:
- `chain_id` (uint64): Chain identifier
- `block_number` (uint64): Block number
- `block_hash` (string): Block hash
- `block_timestamp` (int64): Block timestamp
- `block_json` (bytes): Serialized block data
- `transactions_json` (bytes): Serialized transactions data
- `logs_json` (bytes): Serialized logs data
- `traces_json` (bytes): Serialized traces data
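A hypothetical Go row type mirroring these columns might look like the following; the struct-tag syntax assumes a parquet-go-style library, and the type actually used by the package may differ:

```go
package committer

// BlockRow mirrors the parquet columns listed above. Illustrative only.
type BlockRow struct {
	ChainID          uint64 `parquet:"chain_id"`
	BlockNumber      uint64 `parquet:"block_number"`
	BlockHash        string `parquet:"block_hash"`
	BlockTimestamp   int64  `parquet:"block_timestamp"`
	BlockJSON        []byte `parquet:"block_json"`
	TransactionsJSON []byte `parquet:"transactions_json"`
	LogsJSON         []byte `parquet:"logs_json"`
	TracesJSON       []byte `parquet:"traces_json"`
}
```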

## Processing Flow

1. **Query ClickHouse**: Get the maximum block number for the chain
2. **List S3 Files**: Find all parquet files with the chain prefix
3. **Filter Files**: Skip files where end block ≤ max block number
4. **Sort Files**: Order by start block number (ascending)
5. **Process Sequentially**: For each file:
- Download from S3 to local storage
- Stream parquet file row-by-row
- Skip blocks < next commit block number
- Error if block > next commit block number (missing data)
- Publish found blocks to Kafka
- Increment commit block number
- Clean up local file
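The per-row sequencing rule in step 5 can be sketched roughly as follows (hypothetical names, not the package's actual code): rows below the next commit block are skipped, a gap above it is an error, and a match is published and advances the cursor.

```go
package main

import "fmt"

// commitRows applies the sequencing rule to one file's rows, identified here
// only by block number. Illustrative sketch.
func commitRows(nextCommit uint64, rows []uint64, publish func(uint64) error) (uint64, error) {
	for _, blockNumber := range rows {
		switch {
		case blockNumber < nextCommit:
			continue // already committed; skip
		case blockNumber > nextCommit:
			return nextCommit, fmt.Errorf("missing block %d (found %d)", nextCommit, blockNumber)
		default:
			if err := publish(blockNumber); err != nil {
				return nextCommit, err
			}
			nextCommit++ // advance the commit cursor
		}
	}
	return nextCommit, nil
}

func main() {
	next, err := commitRows(1000, []uint64{998, 999, 1000, 1001, 1002}, func(b uint64) error {
		fmt.Println("published", b)
		return nil
	})
	fmt.Println(next, err) // 1003 <nil>
}
```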

## Error Handling

The committer includes comprehensive error handling:
- Missing configuration validation
- S3 connection and download errors
- Parquet file parsing errors
- Kafka publishing errors
- Block sequence validation errors

All errors are logged with detailed context for debugging.

## Memory Management

The committer is designed to be memory-efficient:
- Downloads files directly to disk (no in-memory buffering)
- Streams parquet files row-by-row
- Processes one file at a time
- Cleans up local files after processing
- Uses semaphores to limit concurrent operations
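A buffered channel is one common way to implement such a semaphore. The snippet below is a generic sketch; the limit of 2 echoes the `STAGING_S3_MAX_PARALLEL_FILE_DOWNLOAD` default added in this PR, but the package's real mechanism may differ.

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	// Channel-based semaphore capping concurrent downloads at 2; illustrative only.
	const maxParallel = 2
	sem := make(chan struct{}, maxParallel)
	var wg sync.WaitGroup

	files := []string{"blocks_0_999.parquet", "blocks_1000_1999.parquet", "blocks_2000_2999.parquet"}
	for _, f := range files {
		wg.Add(1)
		go func(name string) {
			defer wg.Done()
			sem <- struct{}{}        // acquire a slot
			defer func() { <-sem }() // release the slot when done
			fmt.Println("downloading", name)
		}(f)
	}
	wg.Wait()
}
```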