Skip to content

Commit 602dad6

Browse files
authored
Merge pull request #256 from chdb-io/pipeline-up-perf-tuning
Performance tuning of pipeline running
2 parents 022b8ac + 3c8cd64 commit 602dad6

File tree

16 files changed

+5399
-454
lines changed

16 files changed

+5399
-454
lines changed

benchmark/cb_index.html.tpl

Lines changed: 917 additions & 0 deletions
Large diffs are not rendered by default.

benchmark/clickbench.py

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
#!python3
2+
3+
import sys
4+
import time
5+
import timeit
6+
import chdb
7+
import argparse
8+
import pandas as pd
9+
10+
# Absolute path of the standalone clickhouse binary launched by exec_ch_local().
ch_local = "/auxten/chdb/tests/ch24.5/usr/bin/clickhouse"
# Parquet file that the runners read in place of the `hits` table.
data_path = "/auxten/bench/hits_0.parquet"

# Benchmark queries, addressed by list index (0-based).  Every query selects
# "FROM hits"; the runner functions rewrite that clause to the actual data
# source before executing the statement.
queries = [
    """SELECT COUNT(*) FROM hits;""",
    """SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;""",
    """SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;""",
    """SELECT AVG(UserID) FROM hits;""",
    """SELECT COUNT(DISTINCT UserID) FROM hits;""",
    """SELECT COUNT(DISTINCT SearchPhrase) FROM hits;""",
    """SELECT MIN(EventDate), MAX(EventDate) FROM hits;""",
    """SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;""",
    """SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;""",
    """SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;""",
    """SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;""",
    """SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;""",
    """SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;""",
    """SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;""",
    """SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;""",
    """SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;""",
    """SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;""",
    """SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;""",
    """SELECT UserID, extract(minute FROM toDateTime(EventTime)) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;""",
    """SELECT UserID FROM hits WHERE UserID = 435090932899640449;""",
    """SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';""",
    """SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;""",
    """SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;""",
    """SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;""",
    """SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;""",
    """SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;""",
    """SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;""",
    """SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;""",
    # Raw string on purpose: in a normal literal '\1' is an octal escape for
    # chr(1) and '\.' is an invalid escape, so the regex back-reference would
    # never reach the SQL engine intact.
    r"""SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;""",
    """SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), 
SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;""",
    """SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;""",
    """SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;""",
    """SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;""",
    """SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;""",
    """SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;""",
    """SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;""",
    """SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;""",
    """SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;""",
    """SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;""",
    """SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;""",
    """SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;""",
    """SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;""",
    """SELECT DATE_TRUNC('minute', toDateTime(EventTime)) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-14' AND toDate(EventDate) <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', toDateTime(EventTime)) ORDER BY DATE_TRUNC('minute', toDateTime(EventTime)) LIMIT 10 OFFSET 1000;""",
]
58+
59+
60+
def chdb_query(i, output, times=1):
    """Run benchmark query ``i`` with chdb reading the Parquet file directly.

    Every ``FROM hits`` in the stored query text is replaced by a ``file()``
    table expression over ``data_path`` before the statement is handed to
    ``execute_query``.

    Args:
        i: Index into ``queries``.
        output: Output format name forwarded to ``execute_query``.
        times: Number of repetitions forwarded to ``execute_query``.

    Returns:
        Whatever ``execute_query`` returns for the rewritten statement.
    """
    template = queries[i]
    parquet_source = f"FROM file('{data_path}', Parquet)"
    rewritten = template.replace("FROM hits", parquet_source)
    return execute_query(i, output, times, rewritten)
67+
68+
69+
def execute_query(i, output, times, sql):
    """Execute ``sql`` with ``chdb.query`` ``times`` times and collect timings.

    Args:
        i: Query index; only used to label the printed statement.
        output: Output format, passed as the second argument to ``chdb.query``.
        times: How many times the statement is executed.
        sql: The statement to run.

    Returns:
        A ``(time_list, elapsed_list)`` tuple with one entry per run, each
        rounded to two decimals: the duration measured around the
        ``chdb.query`` call with ``timeit.default_timer``, and the value
        returned by the result object's ``elapsed()``.
    """
    print(f"Q{i}: {sql}")
    wall_clock = []
    engine_elapsed = []
    for attempt in range(times):
        began = timeit.default_timer()
        result = chdb.query(sql, output)
        finished = timeit.default_timer()
        wall_clock.append(round(finished - began, 2))
        engine_elapsed.append(round(result.elapsed(), 2))
        print(f"Times: {attempt}")
    print("FuncTime: ", wall_clock)
    print("Elapsed : ", engine_elapsed)
    return (wall_clock, engine_elapsed)
86+
87+
88+
# Lazily loaded DataFrame shared by all chdb_query_pandas() calls; None until
# the first call reads the Parquet file.
hits = None


def chdb_query_pandas(i, output, times=1):
    """Run benchmark query ``i`` with chdb against an in-memory DataFrame.

    On the first call the Parquet file at ``data_path`` is loaded into the
    module-level ``hits`` DataFrame and normalised; later calls reuse it.
    The stored query then has ``FROM hits`` replaced by ``FROM Python(hits)``.

    Args:
        i: Index into ``queries``.
        output: Output format name forwarded to ``execute_query``.
        times: Number of repetitions forwarded to ``execute_query``.

    Returns:
        Whatever ``execute_query`` returns for the rewritten statement.
    """
    # NOTE(review): the SQL refers to the DataFrame by the name "hits";
    # presumably chdb resolves that name from Python scope, so the global must
    # keep this exact name -- confirm against chdb's Python() table function.
    global hits
    if hits is None:
        hits = pd.read_parquet(data_path)
        # Turn the time columns into pandas datetimes, using the unit that
        # pd.to_datetime should apply to each column's values.
        for column, unit in (("EventTime", "s"), ("EventDate", "D")):
            hits[column] = pd.to_datetime(hits[column], unit=unit)
        # Cast every object-dtype column to str.
        object_columns = [name for name in hits.columns if hits[name].dtype == "O"]
        for name in object_columns:
            hits[name] = hits[name].astype(str)
    rewritten = queries[i].replace("FROM hits", "FROM Python(hits)")
    return execute_query(i, output, times, rewritten)
108+
109+
110+
def exec_ch_local(i, log_level="test", output="Null", times=1):
    """Time benchmark query ``i`` by running the standalone clickhouse binary.

    The command that is executed has the shape::

        <ch_local> -q "<sql>" --log-level=<log_level> --time --output-format=<output>

    where ``<sql>`` is the stored query with every ``FROM hits`` replaced by a
    ``file()`` table expression over ``data_path``.

    Args:
        i: Index into ``queries``.
        log_level: Value for the binary's ``--log-level`` option.
        output: Value for the binary's ``--output-format`` option.
        times: Number of times the command is run.

    Returns:
        A list with one entry per run: the duration measured around
        ``subprocess.run`` with ``timeit.default_timer``, rounded to two
        decimals.
    """
    # Kept as a function-scope import, as in the original script.
    import subprocess

    sql = queries[i].replace("FROM hits", f"FROM file('{data_path}', Parquet)")
    cmd = [
        ch_local,
        "-q",
        sql,
        "--log-level=" + log_level,
        "--time",
        "--output-format=" + output,
    ]
    print(" ".join(cmd))
    time_list = []
    for t in range(times):
        start = timeit.default_timer()
        completed = subprocess.run(cmd)
        end = timeit.default_timer()
        time_list.append(round(end - start, 2))
        if completed.returncode != 0:
            # A failed run still gets recorded, but flag it so the number is
            # not mistaken for a valid measurement.
            print(
                f"WARNING: {ch_local} exited with code {completed.returncode}; "
                f"the timing of run {t} is not a valid measurement"
            )
        print(f"Times: {t}")
    print("ExecTime: ", time_list)
    return time_list
137+
138+
139+
# Results of the most recent run of each backend.  They stay None for a
# backend that was not selected, which lets the summary print them
# unconditionally.
chdb_time_list = None
chdb_elapsed_list = None
chdb_pandas_time_list = None
chdb_pandas_elapsed_list = None
exec_time_list = None

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Both positionals are optional at parse time because --all overrides
    # them; single-query mode validates them explicitly below.
    parser.add_argument("query", type=int, nargs="?", help="query index")
    parser.add_argument("output", type=str, nargs="?", help="output format")
    parser.add_argument("--all", action="store_true", help="run all queries")
    parser.add_argument("--times", type=int, default=1, help="run times for each query")
    parser.add_argument("--chdb", action="store_true", help="use chdb to run query")
    parser.add_argument("--pandas", action="store_true", help="use pandas to run query")
    parser.add_argument(
        "--local", action="store_true", help="use local clickhouse binary"
    )
    parser.add_argument(
        "--log_level", type=str, default="test", help="log level for local"
    )
    args = parser.parse_args()

    if args.all:
        # Running everything always discards query output and only logs errors.
        args.output = "Null"
        args.log_level = "error"
        all_time_list = []
        for i in range(len(queries)):
            args.query = i
            # One row per query; each cell is the list of per-run timings.
            row = []
            if args.chdb:
                chdb_time_list, chdb_elapsed_list = chdb_query(
                    args.query, args.output, args.times
                )
                row.append(chdb_time_list)
                row.append(chdb_elapsed_list)
            if args.pandas:
                chdb_pandas_time_list, chdb_pandas_elapsed_list = chdb_query_pandas(
                    args.query, args.output, args.times
                )
                row.append(chdb_pandas_time_list)
                row.append(chdb_pandas_elapsed_list)
            if args.local:
                exec_time_list = exec_ch_local(
                    args.query, args.log_level, args.output, args.times
                )
                row.append(exec_time_list)
            all_time_list.append(row)
        # Column names follow the order in which the cells were appended above.
        df = pd.DataFrame(all_time_list)
        columns = []
        if args.chdb:
            columns += ["chdb_time", "chdb_elapsed"]
        if args.pandas:
            columns += ["chdb_pd_time", "chdb_pd_elapsed"]
        if args.local:
            columns += ["ch_local_time"]
        df.columns = columns
        print("All queries:")
        print(df)
        sys.exit(0)

    # Single-query mode: the positionals are mandatory and must name a query.
    if args.query is None or args.output is None:
        parser.error("query and output are required unless --all is given")
    if not 0 <= args.query < len(queries):
        parser.error(
            f"query index must be between 0 and {len(queries) - 1}, got {args.query}"
        )
    if args.output == "Null":
        args.log_level = "error"

    if args.chdb:
        chdb_time_list, chdb_elapsed_list = chdb_query(
            args.query, args.output, args.times
        )
    if args.pandas:
        chdb_pandas_time_list, chdb_pandas_elapsed_list = chdb_query_pandas(
            args.query, args.output, args.times
        )
    if args.local:
        exec_time_list = exec_ch_local(
            args.query, args.log_level, args.output, args.times
        )

    # print summary
    print(f"Q{args.query}: {queries[args.query]}")
    print("Summary:")
    print(f"chdb_time_list: {chdb_time_list}, elapsed: {chdb_elapsed_list}")
    print(f"chdb_pd_time_list: {chdb_pandas_time_list}, elapsed: {chdb_pandas_elapsed_list}")
    print(f"local_time_list: {exec_time_list}")

0 commit comments

Comments
 (0)