Skip to content

Commit ba6948f

Browse files
author
Tobias Kopp
committed
[Benchmark] Refactor benchmark system.
Instead of using a different script for each experiment and DBMS to benchmark, implement `connectors` for these DBMSs. The connector has a method to execute an experiment with the given parameters and returns the measured times. In addition, the format of the YAML files of the experiments has been refactored to contain all the information and parameters to execute them on each connector. `Benchmark.py` is refactored as well to read the experiment files and execute them on each available specified connector, with possibly multiple configurations. Some more minor changes: - The benchmark script now has the option to execute one (or multiple) specific experiments. - The `run_id` of each experiment run is tracked and inserted into the database.
1 parent 1d6cd0b commit ba6948f

32 files changed

+3559
-1977
lines changed

benchmark/Benchmark.py

Lines changed: 106 additions & 298 deletions
Large diffs are not rendered by default.

benchmark/_schema.yml

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,19 @@
11
description: str()
2-
version: int(required=False, min=1)
32
suite: str()
43
benchmark: str()
54
name: str(required=False)
65
readonly: bool()
7-
pattern: str()
8-
args: str(required=False)
9-
configurations: map(str(), required=False)
10-
tables: include('table_list')
11-
cases: map(any(str(), include('case')), key=any())
12-
compare_to: map(required=False)
13-
chart: include('chart', required=False)
6+
chart: include('chart_def', required=False)
7+
data: map(include('table'), required=False) # Map from table name to 'table'
8+
systems:
9+
mutable: include('mutable', required=False)
10+
PostgreSQL: include('PostgreSQL', required=False)
11+
DuckDB: include('DuckDB', required=False)
12+
HyPer: include('HyPer', required=False)
1413
---
15-
table:
16-
name: str() # table name
17-
path: str(required=False) # path to table file
18-
sf: num(required=False, min=0, max=1) # scale factor as portion of file to load; defaults to 1
19-
delimiter: str(required=False) # defaults to ','
20-
header: int(required=False) # 1 if file has header, 0 otherwise; defaults to 0
21-
table_list:
22-
list(any(str(), include('table')), required=True)
23-
case:
24-
query: str()
25-
tables: include('table_list')
14+
chart_def:
15+
x: include('axis', required=False)
16+
y: include('axis', required=False)
2617
---
2718
axis:
2819
# Kind of scale, one of
@@ -37,6 +28,28 @@ axis:
3728
type: str(required=False)
3829
# A label for the axis
3930
label: str(required=False)
40-
chart:
41-
x: include('axis', required=False)
42-
y: include('axis', required=False)
31+
---
32+
table:
33+
attributes: map(str(), key=str(), required=False) # table column names and types
34+
file: str(required=False) # path to table file
35+
delimiter: str(required=False) # defaults to ','
36+
header: int() # 1 if file has header, 0 otherwise; defaults to 0
37+
format: str(required=False) # file format
38+
scale_factors: map(num(min=0, max=1), required=False) # map from case name to scale factor (portion of file to load)
39+
lines_in_file: int(required=False) # Number of lines in the file. Is counted and added by the benchmark script
40+
---
41+
mutable:
42+
cases: include('cases')
43+
pattern: str()
44+
args: str(required=False)
45+
configurations: map(str(), required=False)
46+
PostgreSQL:
47+
cases: include('cases')
48+
DuckDB:
49+
cases: include('cases')
50+
HyPer:
51+
single_core: bool(required=False)
52+
all_cores: bool(required=False)
53+
cases: include('cases')
54+
---
55+
cases: map(str(), key=any())
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
from abc import ABC, abstractmethod
2+
3+
class Connector(ABC):
    """Abstract base class for DBMS connectors.

    A connector knows how to run one benchmark experiment against a
    particular database system and report the measured execution times.
    """

    @abstractmethod
    def execute(self, n_runs: int, params: dict):
        """Run the experiment described by ``params`` exactly ``n_runs`` times.

        Returns a dict with the measured times for the experiment, of the
        form::

            results
            └── configurations
                └── cases
                    └── times (list)

        i.e. ``configuration name -> {case -> list of floats}``, where each
        list of measured times has length ``n_runs``.

        Example (n_runs=2)::

            {
                'PostgreSQL': {
                    1: [1235.093, 1143.43],
                    2: [1033.711, 1337.37],
                    3: [1043.452, 1010.01],
                    4: [1108.702, 1234.56],
                }
            }
        """
        ...
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
from .connector import *
2+
3+
import duckdb
4+
import os
5+
import json
6+
7+
8+
# Scratch files used while running an experiment; removed by `DuckDB.clean_up()`.
TMP_DB = 'tmp.duckdb'    # temporary DuckDB database file
TMP_SQL_FILE = 'tmp.sql' # temporary SQL script piped into the DuckDB CLI

# TODO way of measuring time is wrong. Use duckdb_cli like in older version.
12+
13+
class DuckDB(Connector):
14+
15+
def __new__(cls, *args, **kwargs):
16+
return super().__new__(cls)
17+
18+
19+
def __init__(self, duckdb_cli, verbose=False):
20+
self.duckdb_cli=duckdb_cli
21+
22+
23+
# Runs an experiment 'n_runs' times, all parameters are in 'params'
24+
def execute(self, n_runs, params: dict):
25+
self.clean_up()
26+
27+
measurement_times = dict() # map that is returned with the measured times
28+
29+
# Check wether tables contain scale factors
30+
with_scale_factors = False
31+
for table in params['data'].values():
32+
if (table.get('scale_factors')):
33+
with_scale_factors = True
34+
break
35+
36+
for _ in range(n_runs):
37+
try:
38+
# Set up database
39+
self.generate_create_table_stmts(params['data'], with_scale_factors)
40+
41+
42+
# If tables contain scale factors, they have to be loaded separately for every case
43+
if (with_scale_factors and bool(params.get('readonly'))):
44+
# Write cases/queries to a file that will be passed to the command to execute
45+
statements = list()
46+
for case, query_stmt in params['cases'].items():
47+
# Create tables from tmp tables with scale factor
48+
for table_name, table in params['data'].items():
49+
statements.append(f"DELETE FROM {table_name};") # empty existing table
50+
if table.get('scale_factors'):
51+
sf = table['scale_factors'][case]
52+
else:
53+
sf = 1
54+
header = int(table.get('header', 0))
55+
num_rows = round((table['lines_in_file'] - header) * sf)
56+
statements.append(f"INSERT INTO {table_name} SELECT * FROM {table_name}_tmp LIMIT {num_rows};")
57+
58+
statements.append(".timer on")
59+
statements.append(query_stmt) # Actual query from this case
60+
statements.append(".timer off")
61+
62+
# Append statements to file
63+
with open(TMP_SQL_FILE, "a+") as tmp:
64+
for stmt in statements:
65+
tmp.write(stmt + "\n")
66+
67+
68+
69+
# Otherwise, tables have to be created just once before the measurements (done above)
70+
else:
71+
# Write cases/queries to a file that will be passed to the command to execute
72+
with open(TMP_SQL_FILE, "a+") as tmp:
73+
tmp.write(".timer on\n")
74+
for case_query in params['cases'].values():
75+
tmp.write(case_query + '\n')
76+
tmp.write(".timer off\n")
77+
78+
79+
# Execute query file and collect measurement data
80+
command = f"./{self.duckdb_cli} {TMP_DB} < {TMP_SQL_FILE}" + " | grep 'Run Time' | cut -d ' ' -f 5 | awk '{print $1 * 1000;}'"
81+
stream = os.popen(f'{command}')
82+
for idx, line in enumerate(stream):
83+
time = float(line.replace("\n", "").replace(",", ".")) # in milliseconds
84+
case = list(params['cases'].keys())[idx]
85+
if case not in measurement_times.keys():
86+
measurement_times[case] = list()
87+
measurement_times[case].append(time)
88+
stream.close()
89+
90+
91+
finally:
92+
self.clean_up()
93+
94+
return {'DuckDB': measurement_times}
95+
96+
97+
# Deletes the used temporary database
98+
def clean_up(self):
99+
if os.path.exists(TMP_DB):
100+
os.remove(TMP_DB)
101+
if os.path.exists(TMP_SQL_FILE):
102+
os.remove(TMP_SQL_FILE)
103+
104+
105+
# Parse attributes of one table, return as string
106+
def parse_attributes(self, attributes: dict):
107+
columns = '('
108+
for column_name, ty in attributes.items():
109+
not_null = 'NOT NULL' if 'NOT NULL' in ty else ''
110+
ty = ty.split(' ')
111+
match (ty[0]):
112+
case 'INT':
113+
typ = 'INT'
114+
case 'CHAR':
115+
typ = f'CHAR({ty[1]})'
116+
case 'DECIMAL':
117+
typ = f'DECIMAL({ty[1]},{ty[2]})'
118+
case 'DATE':
119+
typ = 'DATE'
120+
case 'DOUBLE':
121+
typ = 'DOUBLE'
122+
case 'FLOAT':
123+
typ = 'REAL'
124+
case 'BIGINT':
125+
typ = 'BIGINT'
126+
case _:
127+
raise Exception(f"Unknown type given for '{column_name}'")
128+
columns += f"{column_name} {typ} {not_null}, "
129+
columns = columns[:-2] + ')'
130+
return columns
131+
132+
133+
# Creates tables in the database and copies contents of given files into them
134+
# Call with 'with_scale_factors'=False if data should be loaded as a whole
135+
# Call with 'with_scale_factors'=True if data should be placed in tmp tables
136+
# and copied for each case with different scale factor
137+
def generate_create_table_stmts(self, data: dict, with_scale_factors):
138+
statements = list()
139+
for table_name, table in data.items():
140+
columns = self.parse_attributes(table['attributes'])
141+
142+
delimiter = table.get('delimiter')
143+
header = table.get('header')
144+
format = table['format'].upper()
145+
146+
if with_scale_factors:
147+
table_name += "_tmp"
148+
149+
create = f"CREATE TABLE {table_name} {columns};"
150+
copy = f"COPY {table_name} FROM '{table['file']}' ( "
151+
if delimiter:
152+
delim = delimiter.replace("'", "")
153+
copy += f" DELIMITER \'{delim}\',"
154+
if format:
155+
copy += f" FORMAT {format},"
156+
if header:
157+
copy += f" HEADER," if (header==1) else ""
158+
159+
copy = copy[:-1] + " );"
160+
161+
statements.append(create)
162+
statements.append(copy)
163+
164+
if with_scale_factors:
165+
# Create actual table that will be used for experiment
166+
statements.append(f"CREATE TABLE {table_name[:-4]} {columns};")
167+
168+
with open(TMP_SQL_FILE, "w") as tmp:
169+
for stmt in statements:
170+
tmp.write(stmt + "\n")

0 commit comments

Comments
 (0)