Support more benchmarks in mem_profile (clickbench, h2o, imdb, sort_tpch)

ding-young · ding-young · commit 3717b99772e9 · 2025-07-25T02:46:19.000Z
diff --git a/benchmarks/src/bin/mem_profile.rs b/benchmarks/src/bin/mem_profile.rs
@@ -18,100 +18,98 @@
 //! mem_profile binary entrypoint
 use datafusion::error::Result;
 use std::{
+    env,
     io::{BufRead, BufReader},
     process::{Command, Stdio},
 };
 use structopt::StructOpt;
 
-#[derive(Debug, StructOpt)]
-#[structopt(about = "memory profile command")]
-struct MemProfileOpt {
-    #[structopt(subcommand)]
-    command: BenchmarkCommand,
-}
-
-#[derive(Debug, StructOpt)]
-enum BenchmarkCommand {
-    Tpch(TpchOpt),
-    // TODO Add other benchmark commands here
-}
+use datafusion_benchmarks::{
+    clickbench,
+    h2o::{self, AllQueries},
+    imdb, sort_tpch, tpch,
+};
 
 #[derive(Debug, StructOpt)]
-struct TpchOpt {
-    #[structopt(long, required = true)]
-    path: String,
-
-    /// Query number. If not specified, runs all queries
-    #[structopt(short, long)]
-    query: Option<usize>,
+#[structopt(about = "benchmark command")]
+#[allow(dead_code)]
+enum Options {
+    Clickbench(clickbench::RunOpt),
+    H2o(h2o::RunOpt),
+    Imdb(imdb::RunOpt),
+    SortTpch(sort_tpch::RunOpt),
+    Tpch(tpch::RunOpt),
 }
 
 #[tokio::main]
 pub async fn main() -> Result<()> {
     // 1. parse args and check which benchmarks should be run
-    let opt = MemProfileOpt::from_args();
+    // let opt = MemProfileOpt::from_args();
+    let profile = env::var("PROFILE").unwrap_or_else(|_| "release".to_string());
+
+    let args = env::args().skip(1);
+    // let opt = Options::from_iter(args);
+    let query_range = match Options::from_args() {
+        // TODO: clickbench -- the query range below is hard-coded
+        // TODO: run only a specific query id when one is given (so far only h2o does)
+        Options::Clickbench(_) => 0..=42,
+        Options::H2o(opt) => {
+            let queries = AllQueries::try_new(&opt.queries_path)?;
+            match opt.query {
+                Some(query_id) => query_id..=query_id,
+                None => queries.min_query_id()..=queries.max_query_id(),
+            }
+        }
+        Options::Imdb(_) => imdb::IMDB_QUERY_START_ID..=imdb::IMDB_QUERY_END_ID,
+        Options::SortTpch(_) => {
+            sort_tpch::SORT_TPCH_QUERY_START_ID..=sort_tpch::SORT_TPCH_QUERY_END_ID
+        }
+        Options::Tpch(_) => tpch::TPCH_QUERY_START_ID..=tpch::TPCH_QUERY_END_ID,
+    };
 
     // 2. prebuild test binary so that memory does not blow up due to build process
-    // check binary file location
     println!("Pre-building benchmark binary...");
     let status = Command::new("cargo")
         .args([
             "build",
             "--profile",
-            "release-nonlto",
+            &profile,
             "--features",
             "mimalloc_extended",
             "--bin",
             "dfbench",
         ])
         .status()
         .expect("Failed to build dfbench");
-
-    if !status.success() {
-        panic!("Failed to build dfbench");
-    }
+    assert!(status.success());
     println!("Benchmark binary built successfully.");
 
-    // 3. create a subprocess, run each benchmark with args (1) (2)
-    match opt.command {
-        BenchmarkCommand::Tpch(tpch_opt) => {
-            run_tpch_benchmark(tpch_opt).await?;
-        }
-    }
+    // 3. spawn a new process for each benchmark query and print a summary
+    let mut dfbench_args: Vec<String> = args.collect();
+    println!("{dfbench_args:?}");
+    run_benchmark_as_child_process(&profile, query_range, &mut dfbench_args)?;
 
-    // (maybe we cannot support result file.. and just have to print..)
     Ok(())
 }
 
-async fn run_tpch_benchmark(opt: TpchOpt) -> Result<()> {
-    let mut args: Vec<String> = vec![
-        "./target/release-nonlto/dfbench".to_string(),
-        "tpch".to_string(),
-        "--iterations".to_string(),
-        "1".to_string(),
-        "--path".to_string(),
-        opt.path.clone(),
-        "--format".to_string(),
-        "parquet".to_string(),
-        "--partitions".to_string(),
-        "4".to_string(),
-        "--query".to_string(),
-    ];
-
+fn run_benchmark_as_child_process(
+    profile: &str,
+    query_range: std::ops::RangeInclusive<usize>,
+    args: &mut Vec<String>,
+) -> Result<()> {
     let mut query_strings: Vec<String> = Vec::new();
-    if let Some(query_id) = opt.query {
-        query_strings.push(query_id.to_string());
-    } else {
-        // run all queries.
-        for i in 1..=22 {
-            query_strings.push(i.to_string());
-        }
+    for i in query_range {
+        query_strings.push(i.to_string());
     }
 
+    let command = format!("target/{profile}/dfbench");
+    args.insert(0, command);
+    args.push("--query".to_string());
+
     let mut results = vec![];
     for query_str in query_strings {
         args.push(query_str);
-        let _ = run_query(&args, &mut results);
+        let _ = run_query(args, &mut results);
         args.pop();
     }
 
diff --git a/benchmarks/src/clickbench.rs b/benchmarks/src/clickbench.rs
@@ -207,9 +207,9 @@ impl RunOpt {
         let avg = millis.iter().sum::<f64>() / millis.len() as f64;
         println!("Query {query_id} avg time: {avg:.2} ms");
 
-        if self.common.memory_stat_enabled {
-            print_memory_stats();
-        }
+        // Print memory usage stats using mimalloc (only when compiled with --features mimalloc_extended)
+        print_memory_stats();
+
         Ok(query_results)
     }
 
diff --git a/benchmarks/src/h2o.rs b/benchmarks/src/h2o.rs
@@ -20,7 +20,7 @@
 //! - [H2O AI Benchmark](https://duckdb.org/2023/04/14/h2oai.html)
 //! - [Extended window function benchmark](https://duckdb.org/2024/06/26/benchmarks-over-time.html#window-functions-benchmark)
 
-use crate::util::{BenchmarkRun, CommonOpt};
+use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt};
 use datafusion::logical_expr::{ExplainFormat, ExplainOption};
 use datafusion::{error::Result, prelude::SessionContext};
 use datafusion_common::{
@@ -34,7 +34,7 @@ use structopt::StructOpt;
 #[structopt(verbatim_doc_comment)]
 pub struct RunOpt {
     #[structopt(short, long)]
-    query: Option<usize>,
+    pub query: Option<usize>,
 
     /// Common options
     #[structopt(flatten)]
@@ -48,7 +48,7 @@ pub struct RunOpt {
         long = "queries-path",
         default_value = "benchmarks/queries/h2o/groupby.sql"
     )]
-    queries_path: PathBuf,
+    pub queries_path: PathBuf,
 
     /// Path to data file (parquet or csv)
     /// Default value is the G1_1e7_1e7_100_0.csv file in the h2o benchmark
@@ -132,6 +132,9 @@ impl RunOpt {
             let avg = millis.iter().sum::<f64>() / millis.len() as f64;
             println!("Query {query_id} avg time: {avg:.2} ms");
 
+            // Print memory usage stats using mimalloc (only when compiled with --features mimalloc_extended)
+            print_memory_stats();
+
             if self.common.debug {
                 ctx.sql(sql)
                     .await?
@@ -197,12 +200,12 @@ impl RunOpt {
     }
 }
 
-struct AllQueries {
+pub struct AllQueries {
     queries: Vec<String>,
 }
 
 impl AllQueries {
-    fn try_new(path: &Path) -> Result<Self> {
+    pub fn try_new(path: &Path) -> Result<Self> {
         let all_queries = std::fs::read_to_string(path)
             .map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?;
 
@@ -212,7 +215,7 @@ impl AllQueries {
     }
 
     /// Returns the text of query `query_id`
-    fn get_query(&self, query_id: usize) -> Result<&str> {
+    pub fn get_query(&self, query_id: usize) -> Result<&str> {
         self.queries
             .get(query_id - 1)
             .ok_or_else(|| {
@@ -225,11 +228,11 @@ impl AllQueries {
             .map(|s| s.as_str())
     }
 
-    fn min_query_id(&self) -> usize {
+    pub fn min_query_id(&self) -> usize {
         1
     }
 
-    fn max_query_id(&self) -> usize {
+    pub fn max_query_id(&self) -> usize {
         self.queries.len()
     }
 }
diff --git a/benchmarks/src/imdb/mod.rs b/benchmarks/src/imdb/mod.rs
@@ -54,6 +54,9 @@ pub const IMDB_TABLES: &[&str] = &[
     "person_info",
 ];
 
+pub const IMDB_QUERY_START_ID: usize = 1;
+pub const IMDB_QUERY_END_ID: usize = 113;
+
 /// Get the schema for the IMDB dataset tables
 /// see benchmarks/data/imdb/schematext.sql
 pub fn get_imdb_table_schema(table: &str) -> Schema {
diff --git a/benchmarks/src/imdb/run.rs b/benchmarks/src/imdb/run.rs
@@ -18,8 +18,11 @@
 use std::path::PathBuf;
 use std::sync::Arc;
 
-use super::{get_imdb_table_schema, get_query_sql, IMDB_TABLES};
-use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
+use super::{
+    get_imdb_table_schema, get_query_sql, IMDB_QUERY_END_ID, IMDB_QUERY_START_ID,
+    IMDB_TABLES,
+};
+use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt, QueryResult};
 
 use arrow::record_batch::RecordBatch;
 use arrow::util::pretty::{self, pretty_format_batches};
@@ -91,9 +94,6 @@ pub struct RunOpt {
     prefer_hash_join: BoolDefaultTrue,
 }
 
-const IMDB_QUERY_START_ID: usize = 1;
-const IMDB_QUERY_END_ID: usize = 113;
-
 fn map_query_id_to_str(query_id: usize) -> &'static str {
     match query_id {
         // 1
@@ -341,6 +341,9 @@ impl RunOpt {
         let avg = millis.iter().sum::<f64>() / millis.len() as f64;
         println!("Query {query_id} avg time: {avg:.2} ms");
 
+        // Print memory usage stats using mimalloc (only when compiled with --features mimalloc_extended)
+        print_memory_stats();
+
         Ok(query_results)
     }
 
diff --git a/benchmarks/src/sort_tpch.rs b/benchmarks/src/sort_tpch.rs
@@ -40,7 +40,7 @@ use datafusion_common::instant::Instant;
 use datafusion_common::utils::get_available_parallelism;
 use datafusion_common::DEFAULT_PARQUET_EXTENSION;
 
-use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
+use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt, QueryResult};
 
 #[derive(Debug, StructOpt)]
 pub struct RunOpt {
@@ -74,6 +74,9 @@ pub struct RunOpt {
     limit: Option<usize>,
 }
 
+pub const SORT_TPCH_QUERY_START_ID: usize = 1;
+pub const SORT_TPCH_QUERY_END_ID: usize = 11;
+
 impl RunOpt {
     const SORT_TABLES: [&'static str; 1] = ["lineitem"];
 
@@ -178,7 +181,7 @@ impl RunOpt {
 
         let query_range = match self.query {
             Some(query_id) => query_id..=query_id,
-            None => 1..=Self::SORT_QUERIES.len(),
+            None => SORT_TPCH_QUERY_START_ID..=SORT_TPCH_QUERY_END_ID,
         };
 
         for query_id in query_range {
@@ -246,6 +249,9 @@ impl RunOpt {
         let avg = millis.iter().sum::<f64>() / millis.len() as f64;
         println!("Q{query_id} avg time: {avg:.2} ms");
 
+        // Print memory usage stats using mimalloc (only when compiled with --features mimalloc_extended)
+        print_memory_stats();
+
         Ok(query_results)
     }
 
diff --git a/benchmarks/src/tpch/mod.rs b/benchmarks/src/tpch/mod.rs
@@ -34,6 +34,9 @@ pub const TPCH_TABLES: &[&str] = &[
     "part", "supplier", "partsupp", "customer", "orders", "lineitem", "nation", "region",
 ];
 
+pub const TPCH_QUERY_START_ID: usize = 1;
+pub const TPCH_QUERY_END_ID: usize = 22;
+
 /// The `.tbl` file contains a trailing column
 pub fn get_tbl_tpch_table_schema(table: &str) -> Schema {
     let mut schema = SchemaBuilder::from(get_tpch_table_schema(table).fields);
diff --git a/benchmarks/src/tpch/run.rs b/benchmarks/src/tpch/run.rs
@@ -19,7 +19,8 @@ use std::path::PathBuf;
 use std::sync::Arc;
 
 use super::{
-    get_query_sql, get_tbl_tpch_table_schema, get_tpch_table_schema, TPCH_TABLES,
+    get_query_sql, get_tbl_tpch_table_schema, get_tpch_table_schema, TPCH_QUERY_END_ID,
+    TPCH_QUERY_START_ID, TPCH_TABLES,
 };
 use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt, QueryResult};
 
@@ -97,9 +98,6 @@ pub struct RunOpt {
     sorted: bool,
 }
 
-const TPCH_QUERY_START_ID: usize = 1;
-const TPCH_QUERY_END_ID: usize = 22;
-
 impl RunOpt {
     pub async fn run(self) -> Result<()> {
         println!("Running benchmarks with the following options: {self:?}");