committer redo #287
@@ -0,0 +1,29 @@

```go
package cmd

import (
	"fmt"

	"github.com/rs/zerolog/log"
	"github.com/spf13/cobra"
	"github.com/thirdweb-dev/indexer/internal/committer"
	"github.com/thirdweb-dev/indexer/internal/rpc"
)

var committerCmd = &cobra.Command{
	Use:   "committer",
	Short: "run committer",
	Long:  "Publishes data from S3 to Kafka. If a block is not found in S3, it will panic.",
	Run:   RunCommitter,
}

func RunCommitter(cmd *cobra.Command, args []string) {
	fmt.Println("running committer")
	rpc, err := rpc.Initialize()
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to initialize RPC")
	}
	chainId := rpc.GetChainID()

	committer.Init(chainId, rpc)
	committer.Commit(chainId)
}
```
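For context, this subcommand only takes effect once it is registered on a root command. A minimal, self-contained sketch of that wiring is below; the `indexer` binary name and the inline `rootCmd` are assumptions for illustration, not taken from this diff:

```go
package main

import (
	"fmt"

	"github.com/spf13/cobra"
)

// Hypothetical wiring showing how a committer subcommand is typically
// registered on a root command and dispatched by cobra.
func main() {
	rootCmd := &cobra.Command{Use: "indexer"}
	committerCmd := &cobra.Command{
		Use:   "committer",
		Short: "run committer",
		Run: func(cmd *cobra.Command, args []string) {
			fmt.Println("running committer")
		},
	}
	rootCmd.AddCommand(committerCmd)

	if err := rootCmd.Execute(); err != nil {
		fmt.Println(err)
	}
}
```

With that in place, invoking the binary with the `committer` argument runs the committer's `Run` function.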
@@ -7,6 +7,8 @@ import (

```go
	"strings"
	"time"

	"github.com/caarlos0/env"
	"github.com/joho/godotenv"
	"github.com/rs/zerolog/log"
	"github.com/spf13/viper"
)
```

@@ -270,11 +272,37 @@ type Config struct {

```go
	Publisher  PublisherConfig  `mapstructure:"publisher"`
	Validation ValidationConfig `mapstructure:"validation"`
	Migrator   MigratorConfig   `mapstructure:"migrator"`

	CommitterClickhouseDatabase      string `env:"COMMITTER_CLICKHOUSE_DATABASE"`
	CommitterClickhouseHost          string `env:"COMMITTER_CLICKHOUSE_HOST"`
	CommitterClickhousePort          int    `env:"COMMITTER_CLICKHOUSE_PORT"`
	CommitterClickhouseUsername      string `env:"COMMITTER_CLICKHOUSE_USERNAME"`
	CommitterClickhousePassword      string `env:"COMMITTER_CLICKHOUSE_PASSWORD"`
	CommitterClickhouseEnableTLS     bool   `env:"COMMITTER_CLICKHOUSE_ENABLE_TLS" envDefault:"true"`
	CommitterKafkaBrokers            string `env:"COMMITTER_KAFKA_BROKERS"`
	CommitterKafkaUsername           string `env:"COMMITTER_KAFKA_USERNAME"`
	CommitterKafkaPassword           string `env:"COMMITTER_KAFKA_PASSWORD"`
	CommitterKafkaEnableTLS          bool   `env:"COMMITTER_KAFKA_ENABLE_TLS" envDefault:"true"`
	StagingS3Bucket                  string `env:"STAGING_S3_BUCKET" envDefault:"thirdweb-insight-production"`
	StagingS3Region                  string `env:"STAGING_S3_REGION" envDefault:"us-west-2"`
	StagingS3AccessKeyID             string `env:"STAGING_S3_ACCESS_KEY_ID"`
	StagingS3SecretAccessKey         string `env:"STAGING_S3_SECRET_ACCESS_KEY"`
	StagingS3MaxParallelFileDownload int    `env:"STAGING_S3_MAX_PARALLEL_FILE_DOWNLOAD" envDefault:"2"`
	CommitterRPCNumParallelCalls     int64  `env:"COMMITTER_RPC_NUM_PARALLEL_CALLS" envDefault:"10"`
}

var Cfg Config

func LoadConfig(cfgFile string) error {
	err := godotenv.Load()
	if err != nil {
		log.Info().Msg("No .env file found")
	}
	err = env.Parse(&Cfg)
	if err != nil {
		panic(err)
	}
```
Comment on lines +298 to +306:

🛠️ Refactor suggestion

**Set precedence to ENV > file and avoid panic; parse env last and return errors.**

The current flow parses env first, then unmarshals the file (so the file overrides env), and panics on parse errors. Common precedence is ENV > file, and this function already returns errors.

```diff
@@
-	err := godotenv.Load()
-	if err != nil {
-		log.Info().Msg("No .env file found")
-	}
-	err = env.Parse(&Cfg)
-	if err != nil {
-		panic(err)
-	}
+	err := godotenv.Load()
+	if err != nil {
+		// .env is optional; keep noise low but retain diagnostics
+		log.Debug().Err(err).Msg("Skipping .env load")
+	}
@@
-	err = viper.Unmarshal(&Cfg)
+	err = viper.Unmarshal(&Cfg)
 	if err != nil {
 		return fmt.Errorf("error unmarshalling config: %v", err)
 	}
+
+	// Parse env-tagged fields last so ENV overrides file values.
+	if err := env.Parse(&Cfg); err != nil {
+		return fmt.Errorf("error parsing env into config: %w", err)
+	}
```

Also applies to: lines 332-336.
```go
	if cfgFile != "" {
		viper.SetConfigFile(cfgFile)
		if err := viper.ReadInConfig(); err != nil {
```

@@ -301,7 +329,7 @@ func LoadConfig(cfgFile string) error {

```diff
 	viper.AutomaticEnv()

-	err := viper.Unmarshal(&Cfg)
+	err = viper.Unmarshal(&Cfg)
 	if err != nil {
 		return fmt.Errorf("error unmarshalling config: %v", err)
 	}
```
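As background on how the new env-tagged fields resolve, here is a small self-contained sketch using github.com/caarlos0/env; the `committerEnv` struct is a hypothetical stand-in for two of the fields above, not the real `Config` type:

```go
package main

import (
	"fmt"
	"os"

	"github.com/caarlos0/env"
)

// Hypothetical stand-in mirroring two of the committer fields above.
type committerEnv struct {
	KafkaBrokers string `env:"COMMITTER_KAFKA_BROKERS"`
	KafkaTLS     bool   `env:"COMMITTER_KAFKA_ENABLE_TLS" envDefault:"true"`
}

func main() {
	// Only the brokers variable is set; the TLS flag falls back to its envDefault.
	os.Setenv("COMMITTER_KAFKA_BROKERS", "localhost:9092")

	var c committerEnv
	if err := env.Parse(&c); err != nil {
		panic(err)
	}
	fmt.Println(c.KafkaBrokers, c.KafkaTLS) // localhost:9092 true
}
```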
@@ -0,0 +1,180 @@

# Committer Package

This package implements a committer that processes block data from S3 parquet files and publishes it to Kafka. It follows the requirements specified in the original comments.

## Features

- **ClickHouse Integration**: Gets the maximum block number from ClickHouse for the chain
- **S3 File Discovery**: Lists parquet files from S3 with chain-specific prefixes
- **Block Range Parsing**: Extracts start and end block numbers from S3 filenames
- **File Filtering**: Skips files whose end block is less than the max block number from ClickHouse
- **Sequential Processing**: Processes files in ascending order by start block number
- **Memory-Efficient Streaming**: Streams parquet files row-by-row to minimize memory usage
- **Kafka Publishing**: Publishes processed block data to Kafka
- **Error Handling**: Comprehensive error handling with detailed logging

## Usage

### Basic Usage

```go
package main

import (
	"context"
	"math/big"
	"log"

	"github.com/thirdweb-dev/indexer/internal/committer"
	"github.com/thirdweb-dev/indexer/configs"
)

func main() {
	// Load configuration
	if err := configs.LoadConfig("config.yml"); err != nil {
		log.Fatal("Failed to load config:", err)
	}

	// Create committer for chain ID 1 (Ethereum mainnet)
	chainId := big.NewInt(1)
	committer, err := committer.NewCommitterFromConfig(chainId)
	if err != nil {
		log.Fatal("Failed to create committer:", err)
	}
	defer committer.Close()

	// Process blocks
	ctx := context.Background()
	if err := committer.ProcessBlocks(ctx); err != nil {
		log.Fatal("Failed to process blocks:", err)
	}
}
```
Comment on lines +20 to +52:

**Examples reference non-existent APIs; align with the actual public surface.**

The README shows `NewCommitterFromConfig`/`NewCommitter`/`ProcessBlocks`, but the code exposes `Init`/`Commit` (and a CLI subcommand). Please either update the examples to match the surface that actually ships, or add the constructor-style API the README describes.

I can draft a corrected snippet once you confirm the intended public API.

Also applies to: lines 56-104.
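For illustration, a minimal sketch of the basic example written against the surface that does exist in this PR (`configs.LoadConfig`, `rpc.Initialize`, `committer.Init`, `committer.Commit`), mirroring `RunCommitter` in the cmd package; treat it as a starting point rather than the final API:

```go
package main

import (
	"github.com/rs/zerolog/log"

	"github.com/thirdweb-dev/indexer/configs"
	"github.com/thirdweb-dev/indexer/internal/committer"
	"github.com/thirdweb-dev/indexer/internal/rpc"
)

func main() {
	// Load configuration (same entry point the CLI uses).
	if err := configs.LoadConfig("config.yml"); err != nil {
		log.Fatal().Err(err).Msg("Failed to load config")
	}

	// Initialize the RPC client and derive the chain ID from it.
	rpcClient, err := rpc.Initialize()
	if err != nil {
		log.Fatal().Err(err).Msg("Failed to initialize RPC")
	}
	chainId := rpcClient.GetChainID()

	// Run the committer for this chain.
	committer.Init(chainId, rpcClient)
	committer.Commit(chainId)
}
```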
### Advanced Usage with Custom Configuration

```go
package main

import (
	"context"
	"math/big"
	"log"

	"github.com/thirdweb-dev/indexer/internal/committer"
	"github.com/thirdweb-dev/indexer/configs"
)

func main() {
	// Custom configuration
	chainId := big.NewInt(137) // Polygon

	clickhouseConfig := &configs.ClickhouseConfig{
		Host:     "localhost",
		Port:     9000,
		Username: "default",
		Password: "",
		Database: "insight",
	}

	s3Config := &configs.S3Config{
		Bucket:          "thirdweb-insight-production",
		Region:          "us-east-1",
		AccessKeyID:     "your-access-key",
		SecretAccessKey: "your-secret-key",
	}

	kafkaConfig := &configs.KafkaConfig{
		Brokers: "localhost:9092",
	}

	// Create committer
	committer, err := committer.NewCommitter(chainId, clickhouseConfig, s3Config, kafkaConfig)
	if err != nil {
		log.Fatal("Failed to create committer:", err)
	}
	defer committer.Close()

	// Process blocks
	ctx := context.Background()
	if err := committer.ProcessBlocks(ctx); err != nil {
		log.Fatal("Failed to process blocks:", err)
	}
}
```
## Configuration Requirements

The committer requires the following configuration:

### ClickHouse Configuration
- Host, Port, Username, Password, Database
- Used to query the maximum block number for the chain

### S3 Configuration
- Bucket name (e.g., "thirdweb-insight-production")
- Region, Access Key ID, Secret Access Key
- Used to list and download parquet files

### Kafka Configuration
- Brokers list
- Used to publish processed block data

## S3 File Structure

The committer expects S3 files to follow this naming pattern:

```
chain_${chainId}/year=2024/blocks_1000_2000.parquet
```

Where:
- `chain_${chainId}` is the prefix for the chain
- `year=2024` is the partitioning by year
- `blocks_1000_2000.parquet` contains blocks from 1000 to 2000
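To make the naming convention concrete, a hypothetical helper (not part of the package's exported API) that extracts the block range from such a key might look like this:

```go
package committer

import (
	"fmt"
	"path"
	"strconv"
	"strings"
)

// parseBlockRange is a hypothetical helper that pulls the start and end block
// numbers out of a key such as "chain_1/year=2024/blocks_1000_2000.parquet".
func parseBlockRange(key string) (start, end uint64, err error) {
	name := strings.TrimSuffix(path.Base(key), ".parquet") // "blocks_1000_2000"
	parts := strings.Split(name, "_")
	if len(parts) != 3 || parts[0] != "blocks" {
		return 0, 0, fmt.Errorf("unexpected key format: %s", key)
	}
	if start, err = strconv.ParseUint(parts[1], 10, 64); err != nil {
		return 0, 0, err
	}
	if end, err = strconv.ParseUint(parts[2], 10, 64); err != nil {
		return 0, 0, err
	}
	return start, end, nil
}
```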
## Parquet File Structure

The parquet files should contain the following columns:
- `chain_id` (uint64): Chain identifier
- `block_number` (uint64): Block number
- `block_hash` (string): Block hash
- `block_timestamp` (int64): Block timestamp
- `block_json` (bytes): Serialized block data
- `transactions_json` (bytes): Serialized transactions data
- `logs_json` (bytes): Serialized logs data
- `traces_json` (bytes): Serialized traces data
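For orientation, one way these columns could map onto a Go row type; the struct name and the `parquet` tag syntax (github.com/parquet-go/parquet-go) are assumptions, since the README does not pin a reader library:

```go
package committer

// blockRow is a hypothetical row type mirroring the columns above; the tag
// syntax assumes github.com/parquet-go/parquet-go and is not taken from this PR.
type blockRow struct {
	ChainID          uint64 `parquet:"chain_id"`
	BlockNumber      uint64 `parquet:"block_number"`
	BlockHash        string `parquet:"block_hash"`
	BlockTimestamp   int64  `parquet:"block_timestamp"`
	BlockJSON        []byte `parquet:"block_json"`
	TransactionsJSON []byte `parquet:"transactions_json"`
	LogsJSON         []byte `parquet:"logs_json"`
	TracesJSON       []byte `parquet:"traces_json"`
}
```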
## Processing Flow

1. **Query ClickHouse**: Get the maximum block number for the chain
2. **List S3 Files**: Find all parquet files with the chain prefix
3. **Filter Files**: Skip files where end block ≤ max block number
4. **Sort Files**: Order by start block number (ascending)
5. **Process Sequentially**: For each file (the per-row decision is sketched below):
   - Download from S3 to local storage
   - Stream the parquet file row-by-row
   - Skip blocks < next commit block number
   - Error if block > next commit block number (missing data)
   - Publish found blocks to Kafka
   - Increment the commit block number
   - Clean up the local file
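A minimal sketch of that per-row decision, assuming a `nextCommit` counter for the next expected block number and a caller-supplied publish function (names are illustrative, not the package's actual ones):

```go
package committer

import "fmt"

// processRow is a hypothetical illustration of the skip / error / publish
// decision applied to each parquet row while streaming a file.
func processRow(blockNumber uint64, nextCommit *uint64, publish func(uint64) error) error {
	switch {
	case blockNumber < *nextCommit:
		// Block was already committed by an earlier file or row; skip it.
		return nil
	case blockNumber > *nextCommit:
		// A gap means the staged data is missing a block; abort loudly.
		return fmt.Errorf("expected block %d but found %d", *nextCommit, blockNumber)
	default:
		if err := publish(blockNumber); err != nil {
			return err
		}
		*nextCommit++
		return nil
	}
}
```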
## Error Handling

The committer includes comprehensive error handling:
- Missing configuration validation
- S3 connection and download errors
- Parquet file parsing errors
- Kafka publishing errors
- Block sequence validation errors

All errors are logged with detailed context for debugging.

## Memory Management

The committer is designed to be memory-efficient:
- Downloads files directly to disk (no in-memory buffering)
- Streams parquet files row-by-row
- Processes one file at a time
- Cleans up local files after processing
- Uses semaphores to limit concurrent operations (see the sketch below)
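As an illustration of the last point, concurrent S3 downloads can be capped with a weighted semaphore. The sketch below assumes golang.org/x/sync/semaphore and a limit such as `StagingS3MaxParallelFileDownload`; it is not the package's actual implementation:

```go
package committer

import (
	"context"

	"golang.org/x/sync/semaphore"
)

// downloadAll is a hypothetical sketch: it runs downloads concurrently while a
// weighted semaphore caps how many are in flight at once.
func downloadAll(ctx context.Context, keys []string, maxParallel int64, download func(context.Context, string) error) error {
	sem := semaphore.NewWeighted(maxParallel)
	errs := make(chan error, len(keys))

	for _, key := range keys {
		if err := sem.Acquire(ctx, 1); err != nil {
			return err // context cancelled while waiting for a slot
		}
		go func(k string) {
			defer sem.Release(1)
			errs <- download(ctx, k)
		}(key)
	}

	// Wait for all in-flight downloads by reacquiring the full weight.
	if err := sem.Acquire(ctx, maxParallel); err != nil {
		return err
	}
	close(errs)
	for err := range errs {
		if err != nil {
			return err
		}
	}
	return nil
}
```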
**Add proper error handling and resource cleanup.**

The function should properly handle errors and clean up resources when the committer finishes or encounters an error.