Skip to content

Commit 3717b99

Browse files
committed
support more benchmarks
1 parent d9f45d0 commit 3717b99

File tree

8 files changed

+92
-78
lines changed

8 files changed

+92
-78
lines changed

benchmarks/src/bin/mem_profile.rs

Lines changed: 54 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -18,100 +18,98 @@
1818
//! mem_profile binary entrypoint
1919
use datafusion::error::Result;
2020
use std::{
21+
env,
2122
io::{BufRead, BufReader},
2223
process::{Command, Stdio},
2324
};
2425
use structopt::StructOpt;
2526

26-
#[derive(Debug, StructOpt)]
27-
#[structopt(about = "memory profile command")]
28-
struct MemProfileOpt {
29-
#[structopt(subcommand)]
30-
command: BenchmarkCommand,
31-
}
32-
33-
#[derive(Debug, StructOpt)]
34-
enum BenchmarkCommand {
35-
Tpch(TpchOpt),
36-
// TODO Add other benchmark commands here
37-
}
27+
use datafusion_benchmarks::{
28+
clickbench,
29+
h2o::{self, AllQueries},
30+
imdb, sort_tpch, tpch,
31+
};
3832

3933
#[derive(Debug, StructOpt)]
40-
struct TpchOpt {
41-
#[structopt(long, required = true)]
42-
path: String,
43-
44-
/// Query number. If not specified, runs all queries
45-
#[structopt(short, long)]
46-
query: Option<usize>,
34+
#[structopt(about = "benchmark command")]
35+
#[allow(dead_code)]
36+
enum Options {
37+
Clickbench(clickbench::RunOpt),
38+
H2o(h2o::RunOpt),
39+
Imdb(imdb::RunOpt),
40+
SortTpch(sort_tpch::RunOpt),
41+
Tpch(tpch::RunOpt),
4742
}
4843

4944
#[tokio::main]
5045
pub async fn main() -> Result<()> {
5146
// 1. parse args and check which benchmarks should be run
52-
let opt = MemProfileOpt::from_args();
47+
// let opt = MemProfileOpt::from_args();
48+
let profile = env::var("PROFILE").unwrap_or_else(|_| "release".to_string());
49+
50+
let args = env::args().skip(1);
51+
// let opt = Options::from_iter(args);
52+
let query_range = match Options::from_args() {
53+
// TODO clickbench
54+
// TODO run for specific query id
55+
Options::Clickbench(_) => 0..=42,
56+
Options::H2o(opt) => {
57+
let queries = AllQueries::try_new(&opt.queries_path)?;
58+
match opt.query {
59+
Some(query_id) => query_id..=query_id,
60+
None => queries.min_query_id()..=queries.max_query_id(),
61+
}
62+
}
63+
Options::Imdb(_) => imdb::IMDB_QUERY_START_ID..=imdb::IMDB_QUERY_END_ID,
64+
Options::SortTpch(_) => {
65+
sort_tpch::SORT_TPCH_QUERY_START_ID..=sort_tpch::SORT_TPCH_QUERY_END_ID
66+
}
67+
Options::Tpch(_) => tpch::TPCH_QUERY_START_ID..=tpch::TPCH_QUERY_END_ID,
68+
};
5369

5470
// 2. prebuild test binary so that memory does not blow up due to build process
55-
// check binary file location
5671
println!("Pre-building benchmark binary...");
5772
let status = Command::new("cargo")
5873
.args([
5974
"build",
6075
"--profile",
61-
"release-nonlto",
76+
&profile,
6277
"--features",
6378
"mimalloc_extended",
6479
"--bin",
6580
"dfbench",
6681
])
6782
.status()
6883
.expect("Failed to build dfbench");
69-
70-
if !status.success() {
71-
panic!("Failed to build dfbench");
72-
}
84+
assert!(status.success());
7385
println!("Benchmark binary built successfully.");
7486

75-
// 3. create a subprocess, run each benchmark with args (1) (2)
76-
match opt.command {
77-
BenchmarkCommand::Tpch(tpch_opt) => {
78-
run_tpch_benchmark(tpch_opt).await?;
79-
}
80-
}
87+
// 3. spawn a new process per each benchmark query and print summary
88+
let mut dfbench_args: Vec<String> = args.collect();
89+
println!("{dfbench_args:?}");
90+
run_benchmark_as_child_process(&profile, query_range, &mut dfbench_args)?;
8191

82-
// (maybe we cannot support result file.. and just have to print..)
8392
Ok(())
8493
}
8594

86-
async fn run_tpch_benchmark(opt: TpchOpt) -> Result<()> {
87-
let mut args: Vec<String> = vec![
88-
"./target/release-nonlto/dfbench".to_string(),
89-
"tpch".to_string(),
90-
"--iterations".to_string(),
91-
"1".to_string(),
92-
"--path".to_string(),
93-
opt.path.clone(),
94-
"--format".to_string(),
95-
"parquet".to_string(),
96-
"--partitions".to_string(),
97-
"4".to_string(),
98-
"--query".to_string(),
99-
];
100-
95+
fn run_benchmark_as_child_process(
96+
profile: &str,
97+
query_range: std::ops::RangeInclusive<usize>,
98+
args: &mut Vec<String>,
99+
) -> Result<()> {
101100
let mut query_strings: Vec<String> = Vec::new();
102-
if let Some(query_id) = opt.query {
103-
query_strings.push(query_id.to_string());
104-
} else {
105-
// run all queries.
106-
for i in 1..=22 {
107-
query_strings.push(i.to_string());
108-
}
101+
for i in query_range {
102+
query_strings.push(i.to_string());
109103
}
110104

105+
let command = format!("target/{profile}/dfbench");
106+
args.insert(0, command);
107+
args.push("--query".to_string());
108+
111109
let mut results = vec![];
112110
for query_str in query_strings {
113111
args.push(query_str);
114-
let _ = run_query(&args, &mut results);
112+
let _ = run_query(args, &mut results);
115113
args.pop();
116114
}
117115

benchmarks/src/clickbench.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -207,9 +207,9 @@ impl RunOpt {
207207
let avg = millis.iter().sum::<f64>() / millis.len() as f64;
208208
println!("Query {query_id} avg time: {avg:.2} ms");
209209

210-
if self.common.memory_stat_enabled {
211-
print_memory_stats();
212-
}
210+
// Print memory usage stats using mimalloc (only when compiled with --features mimalloc_extended)
211+
print_memory_stats();
212+
213213
Ok(query_results)
214214
}
215215

benchmarks/src/h2o.rs

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
//! - [H2O AI Benchmark](https://duckdb.org/2023/04/14/h2oai.html)
2121
//! - [Extended window function benchmark](https://duckdb.org/2024/06/26/benchmarks-over-time.html#window-functions-benchmark)
2222
23-
use crate::util::{BenchmarkRun, CommonOpt};
23+
use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt};
2424
use datafusion::logical_expr::{ExplainFormat, ExplainOption};
2525
use datafusion::{error::Result, prelude::SessionContext};
2626
use datafusion_common::{
@@ -34,7 +34,7 @@ use structopt::StructOpt;
3434
#[structopt(verbatim_doc_comment)]
3535
pub struct RunOpt {
3636
#[structopt(short, long)]
37-
query: Option<usize>,
37+
pub query: Option<usize>,
3838

3939
/// Common options
4040
#[structopt(flatten)]
@@ -48,7 +48,7 @@ pub struct RunOpt {
4848
long = "queries-path",
4949
default_value = "benchmarks/queries/h2o/groupby.sql"
5050
)]
51-
queries_path: PathBuf,
51+
pub queries_path: PathBuf,
5252

5353
/// Path to data file (parquet or csv)
5454
/// Default value is the G1_1e7_1e7_100_0.csv file in the h2o benchmark
@@ -132,6 +132,9 @@ impl RunOpt {
132132
let avg = millis.iter().sum::<f64>() / millis.len() as f64;
133133
println!("Query {query_id} avg time: {avg:.2} ms");
134134

135+
// Print memory usage stats using mimalloc (only when compiled with --features mimalloc_extended)
136+
print_memory_stats();
137+
135138
if self.common.debug {
136139
ctx.sql(sql)
137140
.await?
@@ -197,12 +200,12 @@ impl RunOpt {
197200
}
198201
}
199202

200-
struct AllQueries {
203+
pub struct AllQueries {
201204
queries: Vec<String>,
202205
}
203206

204207
impl AllQueries {
205-
fn try_new(path: &Path) -> Result<Self> {
208+
pub fn try_new(path: &Path) -> Result<Self> {
206209
let all_queries = std::fs::read_to_string(path)
207210
.map_err(|e| exec_datafusion_err!("Could not open {path:?}: {e}"))?;
208211

@@ -212,7 +215,7 @@ impl AllQueries {
212215
}
213216

214217
/// Returns the text of query `query_id`
215-
fn get_query(&self, query_id: usize) -> Result<&str> {
218+
pub fn get_query(&self, query_id: usize) -> Result<&str> {
216219
self.queries
217220
.get(query_id - 1)
218221
.ok_or_else(|| {
@@ -225,11 +228,11 @@ impl AllQueries {
225228
.map(|s| s.as_str())
226229
}
227230

228-
fn min_query_id(&self) -> usize {
231+
pub fn min_query_id(&self) -> usize {
229232
1
230233
}
231234

232-
fn max_query_id(&self) -> usize {
235+
pub fn max_query_id(&self) -> usize {
233236
self.queries.len()
234237
}
235238
}

benchmarks/src/imdb/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ pub const IMDB_TABLES: &[&str] = &[
5454
"person_info",
5555
];
5656

57+
pub const IMDB_QUERY_START_ID: usize = 1;
58+
pub const IMDB_QUERY_END_ID: usize = 113;
59+
5760
/// Get the schema for the IMDB dataset tables
5861
/// see benchmarks/data/imdb/schematext.sql
5962
pub fn get_imdb_table_schema(table: &str) -> Schema {

benchmarks/src/imdb/run.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,11 @@
1818
use std::path::PathBuf;
1919
use std::sync::Arc;
2020

21-
use super::{get_imdb_table_schema, get_query_sql, IMDB_TABLES};
22-
use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
21+
use super::{
22+
get_imdb_table_schema, get_query_sql, IMDB_QUERY_END_ID, IMDB_QUERY_START_ID,
23+
IMDB_TABLES,
24+
};
25+
use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt, QueryResult};
2326

2427
use arrow::record_batch::RecordBatch;
2528
use arrow::util::pretty::{self, pretty_format_batches};
@@ -91,9 +94,6 @@ pub struct RunOpt {
9194
prefer_hash_join: BoolDefaultTrue,
9295
}
9396

94-
const IMDB_QUERY_START_ID: usize = 1;
95-
const IMDB_QUERY_END_ID: usize = 113;
96-
9797
fn map_query_id_to_str(query_id: usize) -> &'static str {
9898
match query_id {
9999
// 1
@@ -341,6 +341,9 @@ impl RunOpt {
341341
let avg = millis.iter().sum::<f64>() / millis.len() as f64;
342342
println!("Query {query_id} avg time: {avg:.2} ms");
343343

344+
// Print memory usage stats using mimalloc (only when compiled with --features mimalloc_extended)
345+
print_memory_stats();
346+
344347
Ok(query_results)
345348
}
346349

benchmarks/src/sort_tpch.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ use datafusion_common::instant::Instant;
4040
use datafusion_common::utils::get_available_parallelism;
4141
use datafusion_common::DEFAULT_PARQUET_EXTENSION;
4242

43-
use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
43+
use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt, QueryResult};
4444

4545
#[derive(Debug, StructOpt)]
4646
pub struct RunOpt {
@@ -74,6 +74,9 @@ pub struct RunOpt {
7474
limit: Option<usize>,
7575
}
7676

77+
pub const SORT_TPCH_QUERY_START_ID: usize = 1;
78+
pub const SORT_TPCH_QUERY_END_ID: usize = 11;
79+
7780
impl RunOpt {
7881
const SORT_TABLES: [&'static str; 1] = ["lineitem"];
7982

@@ -178,7 +181,7 @@ impl RunOpt {
178181

179182
let query_range = match self.query {
180183
Some(query_id) => query_id..=query_id,
181-
None => 1..=Self::SORT_QUERIES.len(),
184+
None => SORT_TPCH_QUERY_START_ID..=SORT_TPCH_QUERY_END_ID,
182185
};
183186

184187
for query_id in query_range {
@@ -246,6 +249,9 @@ impl RunOpt {
246249
let avg = millis.iter().sum::<f64>() / millis.len() as f64;
247250
println!("Q{query_id} avg time: {avg:.2} ms");
248251

252+
// Print memory usage stats using mimalloc (only when compiled with --features mimalloc_extended)
253+
print_memory_stats();
254+
249255
Ok(query_results)
250256
}
251257

benchmarks/src/tpch/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ pub const TPCH_TABLES: &[&str] = &[
3434
"part", "supplier", "partsupp", "customer", "orders", "lineitem", "nation", "region",
3535
];
3636

37+
pub const TPCH_QUERY_START_ID: usize = 1;
38+
pub const TPCH_QUERY_END_ID: usize = 22;
39+
3740
/// The `.tbl` file contains a trailing column
3841
pub fn get_tbl_tpch_table_schema(table: &str) -> Schema {
3942
let mut schema = SchemaBuilder::from(get_tpch_table_schema(table).fields);

benchmarks/src/tpch/run.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ use std::path::PathBuf;
1919
use std::sync::Arc;
2020

2121
use super::{
22-
get_query_sql, get_tbl_tpch_table_schema, get_tpch_table_schema, TPCH_TABLES,
22+
get_query_sql, get_tbl_tpch_table_schema, get_tpch_table_schema, TPCH_QUERY_END_ID,
23+
TPCH_QUERY_START_ID, TPCH_TABLES,
2324
};
2425
use crate::util::{print_memory_stats, BenchmarkRun, CommonOpt, QueryResult};
2526

@@ -97,9 +98,6 @@ pub struct RunOpt {
9798
sorted: bool,
9899
}
99100

100-
const TPCH_QUERY_START_ID: usize = 1;
101-
const TPCH_QUERY_END_ID: usize = 22;
102-
103101
impl RunOpt {
104102
pub async fn run(self) -> Result<()> {
105103
println!("Running benchmarks with the following options: {self:?}");

0 commit comments

Comments
 (0)