Skip to content
This repository was archived by the owner on May 24, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
e703bff
refactor: job isolation done
Mohiiit Jan 22, 2025
ff84440
fix: tests fixed
Mohiiit Jan 22, 2025
decc1ad
feat: metadata struct introduced
Mohiiit Jan 23, 2025
d11c246
refactor: metadata introduced to each job, happy flow working
Mohiiit Jan 23, 2025
a35594a
chore: comments resolved
Mohiiit Jan 31, 2025
7336d08
fix: e2e test should work
Mohiiit Jan 31, 2025
37cff7a
fix: fixing the snos fact
Mohiiit Jan 31, 2025
d0f2e34
chore: linting and e2e test
Mohiiit Jan 31, 2025
11d3be9
fix: snos full output set to false in e2e
Mohiiit Jan 31, 2025
b1ba517
chore: linting
Mohiiit Jan 31, 2025
9c2c64f
chore: linting, and addressing a few comments
Mohiiit Jan 31, 2025
e3dcaea
Merge branch 'main' into refactor/job-isolation
Mohiiit Jan 31, 2025
04e9324
chore: docs added
Mohiiit Jan 31, 2025
ac2ee05
chore: comments resolved
Mohiiit Mar 4, 2025
fe0f37a
refactor(tests): all tests updated with the latest metadata changes
Mohiiit Mar 4, 2025
d490a65
refactor: metadata refactored
Mohiiit Mar 4, 2025
3408e91
Merge branch 'main' into refactor/job-isolation
Mohiiit Mar 4, 2025
d7603bd
chore: fixing post merge
Mohiiit Mar 4, 2025
e4acd86
chore(ci): fixing the workflow to use rustup show
Mohiiit Mar 4, 2025
f46286d
chore(ci): fixing the rustup command
Mohiiit Mar 4, 2025
93d0db6
chore(ci): fixing the test ci
Mohiiit Mar 4, 2025
11cf440
chore(ci): fixing the coverage and e2e workflows
Mohiiit Mar 4, 2025
bb6f271
chore(ci): reverting to initial version
Mohiiit Mar 4, 2025
d25b21d
fix(ci): coverage workflow updated v1
Mohiiit Mar 4, 2025
4a2c548
fix(ci): coverage workflow updated v2
Mohiiit Mar 4, 2025
de27ab8
fix(ci): coverage workflow updated v3
Mohiiit Mar 4, 2025
a2ee8de
fix(ci): coverage workflow updated v4
Mohiiit Mar 5, 2025
114043b
fix(ci): coverage workflow updated v5
Mohiiit Mar 5, 2025
fcc9bde
fix(ci): coverage workflow updated v6
Mohiiit Mar 5, 2025
dd03ea6
fix(ci): coverage workflow updated v7
Mohiiit Mar 5, 2025
639084d
chore(fix): tests fixed and ci workflow fixed for e2e
Mohiiit Mar 5, 2025
685daf1
fix(test): state update create job test fixed
Mohiiit Mar 5, 2025
4fcbf71
chore(e2e): test time fixed and expectedDBState updated
Mohiiit Mar 5, 2025
f5a123e
refactor: constants fixed
Mohiiit Mar 5, 2025
c648a2f
refactor(tests): build job item moved to common
Mohiiit Mar 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,11 +35,14 @@ jobs:
sudo apt update
sudo apt-get install -y clang llvm libudev-dev protobuf-compiler gcc g++ build-essential libssl-dev pkg-config curl wget git libclang-dev

- name: Install Rust toolchain
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: nightly
- name: Install Rust toolchain using rustup
run: |
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
rustup toolchain install nightly-2024-09-05 --profile minimal
rustup default nightly-2024-09-05
rustup override set nightly-2024-09-05
rustup show

- name: Rust Cache
uses: Swatinem/rust-cache@v2
Expand Down Expand Up @@ -106,6 +109,8 @@ jobs:
- name: Build Madara
working-directory: madara
run: |
source "$HOME/.cargo/env"
rustup toolchain install 1.81-x86_64-unknown-linux-gnu --profile minimal
cargo build
mv target/debug/madara ../madara-binary
cd ..
Expand All @@ -124,15 +129,19 @@ jobs:
make snos

- name: Check rust version
run: rustup show
run: |
source "$HOME/.cargo/env"
rustup show

- name: Run llvm-cov tests
env:
MADARA_ORCHESTRATOR_ETHEREUM_SETTLEMENT_RPC_URL: ${{ secrets.ETHEREUM_SEPOLIA_BLAST_RPC }}
MADARA_ORCHESTRATOR_RPC_FOR_SNOS: ${{ secrets.RPC_FOR_SNOS }}
# the self hosted runner has a different region so we override it here
AWS_REGION: us-east-1
run: RUST_LOG=debug RUST_BACKTRACE=1 cargo llvm-cov nextest --release --features testing --lcov --output-path lcov.info --test-threads=1 --workspace --exclude=e2e-tests --no-fail-fast
run: |
source "$HOME/.cargo/env"
RUST_LOG=debug RUST_BACKTRACE=1 cargo llvm-cov nextest --release --features testing --lcov --output-path lcov.info --test-threads=1 --workspace --exclude=e2e-tests --no-fail-fast

- name: Coveralls
uses: coverallsapp/github-action@v2
Expand Down
14 changes: 9 additions & 5 deletions .github/workflows/e2e-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,14 @@ jobs:
sudo apt update
sudo apt-get install -y clang llvm libudev-dev protobuf-compiler gcc g++ build-essential libssl-dev pkg-config curl wget git libclang-dev

- name: Install Rust toolchain
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: nightly
- name: Install Rust toolchain using rustup
run: |
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
rustup toolchain install nightly-2024-09-05 --profile minimal
rustup default nightly-2024-09-05
rustup override set nightly-2024-09-05
rustup show

- name: Rust Cache
uses: Swatinem/rust-cache@v2
Expand Down Expand Up @@ -89,4 +92,5 @@ jobs:
# the self hosted runner has a different region so we override it here
AWS_REGION: us-east-1
run: |
source "$HOME/.cargo/env"
RUST_LOG=info cargo test --features testing test_orchestrator_workflow -- --nocapture
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).

## Added

- Added metadata serialization and deserialization
- Limits on SNOS job concurrency
- Added JOB_METADATA_PROCESSING_STARTED_AT
- Added retry job endpoint for failed jobs
Expand Down Expand Up @@ -53,6 +54,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).

## Changed

- refactor: job isolation added, each job will have needed information from its worker
- Increased interval time for spawn_consumer
- verify_job now handles VerificationTimeout status
- refactor: expect removed and added error wraps
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ axum-macros = "0.4.1"
bincode = "1.3.3"
bytes = "1.7.2"
color-eyre = "0.6.2"
chrono = "0.4.0"
chrono = { version = "0.4", features = ["serde"] }
c-kzg = "1.0.3"
dotenvy = "0.15.7"
futures = "0.3.30"
Expand Down
24 changes: 18 additions & 6 deletions crates/orchestrator/src/database/mongodb/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,14 +216,26 @@ impl Database for MongoDb {

tracing::debug!(job_type = ?job_type, category = "db_call", "Fetching latest job by type");

// Get the first (and only) result if it exists
match cursor.try_next().await? {
Some(doc) => {
let job: JobItem = mongodb::bson::from_document(doc)?;
let attributes = [KeyValue::new("db_operation_name", "get_latest_job_by_type")];
let duration = start.elapsed();
ORCHESTRATOR_METRICS.db_calls_response_time.record(duration.as_secs_f64(), &attributes);
Ok(Some(job))
// Try to deserialize and log any errors
match mongodb::bson::from_document::<JobItem>(doc.clone()) {
Ok(job) => {
tracing::debug!(deserialized_job = ?job, "Successfully deserialized job");
let attributes = [KeyValue::new("db_operation_name", "get_latest_job_by_type")];
let duration = start.elapsed();
ORCHESTRATOR_METRICS.db_calls_response_time.record(duration.as_secs_f64(), &attributes);
Ok(Some(job))
}
Err(e) => {
tracing::error!(
error = %e,
document = ?doc,
"Failed to deserialize document into JobItem"
);
Err(eyre!("Failed to deserialize document: {}", e))
}
}
}
None => Ok(None),
}
Expand Down
14 changes: 0 additions & 14 deletions crates/orchestrator/src/jobs/constants.rs

This file was deleted.

75 changes: 59 additions & 16 deletions crates/orchestrator/src/jobs/da_job/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use std::sync::Arc;

use async_trait::async_trait;
use chrono::{SubsecRound, Utc};
use color_eyre::eyre::WrapErr;
use color_eyre::eyre::{eyre, WrapErr};
use lazy_static::lazy_static;
use num_bigint::{BigUint, ToBigUint};
use num_traits::{Num, Zero};
Expand All @@ -19,8 +19,8 @@ use uuid::Uuid;
use super::types::{JobItem, JobStatus, JobType, JobVerificationStatus};
use super::{Job, JobError, OtherError};
use crate::config::Config;
use crate::constants::BLOB_DATA_FILE_NAME;
use crate::helpers;
use crate::jobs::metadata::{DaMetadata, JobMetadata, JobSpecificMetadata};
use crate::jobs::state_update_job::utils::biguint_vec_to_u8_vec;

lazy_static! {
Expand Down Expand Up @@ -70,7 +70,7 @@ impl Job for DaJob {
&self,
_config: Arc<Config>,
internal_id: String,
metadata: HashMap<String, String>,
metadata: JobMetadata,
) -> Result<JobItem, JobError> {
let job_id = Uuid::new_v4();
tracing::info!(log_type = "starting", category = "da", function_type = "create_job", block_no = %internal_id, "DA job creation started.");
Expand All @@ -92,7 +92,21 @@ impl Job for DaJob {
#[tracing::instrument(fields(category = "da"), skip(self, config), ret, err)]
async fn process_job(&self, config: Arc<Config>, job: &mut JobItem) -> Result<String, JobError> {
let internal_id = job.internal_id.clone();
tracing::info!(log_type = "starting", category = "da", function_type = "process_job", job_id = ?job.id, block_no = %internal_id, "DA job processing started.");
tracing::info!(
log_type = "starting",
category = "da",
function_type = "process_job",
job_id = ?job.id,
block_no = %internal_id,
"DA job processing started."
);

// Get DA-specific metadata
let mut da_metadata: DaMetadata = job.metadata.specific.clone().try_into().map_err(|e| {
tracing::error!(job_id = ?job.id, error = ?e, "Invalid metadata type for DA job");
JobError::Other(OtherError(e))
})?;

let block_no = job.internal_id.parse::<u64>().wrap_err("Failed to parse u64".to_string()).map_err(|e| {
tracing::error!(job_id = ?job.id, error = ?e, "Failed to parse block number");
JobError::Other(OtherError(e))
Expand All @@ -116,13 +130,14 @@ impl Job for DaJob {
MaybePendingStateUpdate::Update(state_update) => state_update,
};
tracing::debug!(job_id = ?job.id, "Retrieved state update");

// constructing the data from the rpc
let blob_data = state_update_to_blob_data(block_no, state_update, config.clone()).await.map_err(|e| {
tracing::error!(job_id = ?job.id, error = ?e, "Failed to convert state update to blob data");
JobError::Other(OtherError(e))
})?;

// transforming the data so that we can apply FFT on this.
// @note: we can skip this step if in the above step we return vec<BigUint> directly
let blob_data_biguint = convert_to_biguint(blob_data.clone());
tracing::trace!(job_id = ?job.id, "Converted blob data to BigUint");

Expand All @@ -131,16 +146,26 @@ impl Job for DaJob {
tracing::error!(job_id = ?job.id, error = ?e, "Failed to apply FFT transformation");
JobError::Other(OtherError(e))
})?;
// data transformation on the data
tracing::trace!(job_id = ?job.id, "Applied FFT transformation");

store_blob_data(transformed_data.clone(), block_no, config.clone()).await?;
// Get blob data path from metadata
let blob_data_path = da_metadata.blob_data_path.as_ref().ok_or_else(|| {
tracing::error!(job_id = ?job.id, "Blob data path not found in metadata");
JobError::Other(OtherError(eyre!("Blob data path not found in metadata")))
})?;

// Store the transformed data
store_blob_data(transformed_data.clone(), blob_data_path, config.clone()).await?;
tracing::debug!(job_id = ?job.id, "Stored blob data");

let max_bytes_per_blob = config.da_client().max_bytes_per_blob().await;
let max_blob_per_txn = config.da_client().max_blob_per_txn().await;
tracing::trace!(job_id = ?job.id, max_bytes_per_blob = max_bytes_per_blob, max_blob_per_txn = max_blob_per_txn, "Retrieved DA client configuration");
// converting BigUints to Vec<u8>, one Vec<u8> represents one blob data
tracing::trace!(
job_id = ?job.id,
max_bytes_per_blob = max_bytes_per_blob,
max_blob_per_txn = max_blob_per_txn,
"Retrieved DA client configuration"
);

let blob_array = data_to_blobs(max_bytes_per_blob, transformed_data)?;
let current_blob_length: u64 = blob_array
Expand All @@ -153,9 +178,14 @@ impl Job for DaJob {
})?;
tracing::debug!(job_id = ?job.id, blob_count = current_blob_length, "Converted data to blobs");

// there is a limit on number of blobs per txn, checking that here
// Check blob limit
if current_blob_length > max_blob_per_txn {
tracing::warn!(job_id = ?job.id, current_blob_length = current_blob_length, max_blob_per_txn = max_blob_per_txn, "Exceeded maximum number of blobs per transaction");
tracing::error!(
job_id = ?job.id,
current_blob_length = current_blob_length,
max_blob_per_txn = max_blob_per_txn,
"Exceeded maximum number of blobs per transaction"
);
Err(DaError::MaxBlobsLimitExceeded {
max_blob_per_txn,
current_blob_length,
Expand All @@ -164,13 +194,24 @@ impl Job for DaJob {
})?
}

// making the txn to the DA layer
// Publish to DA layer
let external_id = config.da_client().publish_state_diff(blob_array, &[0; 32]).await.map_err(|e| {
tracing::error!(job_id = ?job.id, error = ?e, "Failed to publish state diff to DA layer");
JobError::Other(OtherError(e))
})?;

tracing::info!(log_type = "completed", category = "da", function_type = "process_job", job_id = ?job.id, block_no = %internal_id, external_id = ?external_id, "Successfully published state diff to DA layer.");
da_metadata.tx_hash = Some(external_id.clone());
job.metadata.specific = JobSpecificMetadata::Da(da_metadata);

tracing::info!(
log_type = "completed",
category = "da",
function_type = "process_job",
job_id = ?job.id,
block_no = %internal_id,
external_id = ?external_id,
"Successfully published state diff to DA layer."
);
Ok(external_id)
}

Expand Down Expand Up @@ -352,14 +393,16 @@ pub async fn state_update_to_blob_data(
}

/// Stores the blob data using the storage client at the blob data path supplied by the job metadata.
async fn store_blob_data(blob_data: Vec<BigUint>, block_number: u64, config: Arc<Config>) -> Result<(), JobError> {
async fn store_blob_data(blob_data: Vec<BigUint>, blob_data_path: &str, config: Arc<Config>) -> Result<(), JobError> {
let storage_client = config.storage();
let key = block_number.to_string() + "/" + BLOB_DATA_FILE_NAME;

let blob_data_vec_u8 = biguint_vec_to_u8_vec(blob_data.as_slice());

if !blob_data_vec_u8.is_empty() {
storage_client.put_data(blob_data_vec_u8.into(), &key).await.map_err(|e| JobError::Other(OtherError(e)))?;
storage_client
.put_data(blob_data_vec_u8.into(), blob_data_path)
.await
.map_err(|e| JobError::Other(OtherError(e)))?;
}

Ok(())
Expand Down
38 changes: 38 additions & 0 deletions crates/orchestrator/src/jobs/metadata/common.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
//! Common metadata shared across all job types.

use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};

/// Common metadata fields shared across all job types.
///
/// # Field Management
/// These fields are automatically managed by the job processing system and should not
/// be modified directly by workers or jobs. The system uses these fields to:
/// - Track processing and verification attempts
/// - Record completion timestamps
/// - Store failure information
///
/// # Serialization
/// Timestamps are serialized as Unix epoch seconds (`chrono::serde::ts_seconds_option`);
/// `None` serializes as null. Field order is kept stable because metadata may be
/// (de)serialized with non-self-describing formats.
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)]
pub struct CommonMetadata {
    /// Number of times the job has been processed
    pub process_attempt_no: u64,
    /// Number of times the job has been retried after processing failures
    pub process_retry_attempt_no: u64,
    /// Number of times the job has been verified
    pub verification_attempt_no: u64,
    /// Number of times the job has been retried after verification failures
    pub verification_retry_attempt_no: u64,
    /// Timestamp when job processing started (Unix seconds; `None` until processing begins)
    #[serde(with = "chrono::serde::ts_seconds_option")]
    pub process_started_at: Option<DateTime<Utc>>,
    /// Timestamp when job processing completed (Unix seconds; `None` until processing finishes)
    #[serde(with = "chrono::serde::ts_seconds_option")]
    pub process_completed_at: Option<DateTime<Utc>>,
    /// Timestamp when job verification started (Unix seconds; `None` until verification begins)
    #[serde(with = "chrono::serde::ts_seconds_option")]
    pub verification_started_at: Option<DateTime<Utc>>,
    /// Timestamp when job verification completed (Unix seconds; `None` until verification finishes)
    #[serde(with = "chrono::serde::ts_seconds_option")]
    pub verification_completed_at: Option<DateTime<Utc>>,
    /// Reason for job failure, if any; populated by the job system on failure
    pub failure_reason: Option<String>,
}
21 changes: 21 additions & 0 deletions crates/orchestrator/src/jobs/metadata/da.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
//! Metadata for data availability (DA) jobs.

use serde::{Deserialize, Serialize};

/// Metadata specific to data availability (DA) jobs.
///
/// # Field Management
/// - Worker-initialized fields: `block_number` and `blob_data_path` — set by the worker
///   when the job is created and read during processing.
/// - Job-populated fields: `tx_hash` — filled in by the job during processing, after the
///   state diff has been published to the DA layer.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct DaMetadata {
    // Worker-initialized fields
    /// Block number for data availability
    pub block_number: u64,
    /// Storage path where the transformed blob data is written; `None` means the path
    /// was not provided by the worker (processing treats this as an error)
    pub blob_data_path: Option<String>,

    // Job-populated fields
    /// Transaction hash (external id) returned by the DA layer after data submission;
    /// `None` until the job has been processed
    pub tx_hash: Option<String>,
}
Loading
Loading