|
| 1 | +#!python3 |
| 2 | + |
import argparse
import os
import sys
import time
import timeit

import chdb
import pandas as pd
| 9 | + |
# Path of the standalone ClickHouse binary used by the ``--local`` backend.
# Override with the CHDB_BENCH_CLICKHOUSE environment variable; the default is
# the location this script was originally written against.
ch_local = os.environ.get(
    "CHDB_BENCH_CLICKHOUSE", "/auxten/chdb/tests/ch24.5/usr/bin/clickhouse"
)
# Path of the Parquet file holding the ``hits`` data that every query reads.
# Override with the CHDB_BENCH_DATA environment variable.
data_path = os.environ.get("CHDB_BENCH_DATA", "/auxten/bench/hits_0.parquet")
| 12 | + |
# Benchmark queries, addressed by their 0-based list index (the ``query``
# command-line argument).  Every query reads from the placeholder table name
# ``hits``; the runner functions substitute a concrete table expression with
# ``str.replace("FROM hits", ...)``, so the exact text "FROM hits" must be kept.
queries = [
    """SELECT COUNT(*) FROM hits;""",
    """SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;""",
    """SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;""",
    """SELECT AVG(UserID) FROM hits;""",
    """SELECT COUNT(DISTINCT UserID) FROM hits;""",
    """SELECT COUNT(DISTINCT SearchPhrase) FROM hits;""",
    """SELECT MIN(EventDate), MAX(EventDate) FROM hits;""",
    """SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;""",
    """SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;""",
    """SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;""",
    """SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;""",
    """SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;""",
    """SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;""",
    """SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;""",
    """SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;""",
    """SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;""",
    """SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;""",
    """SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;""",
    """SELECT UserID, extract(minute FROM toDateTime(EventTime)) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;""",
    """SELECT UserID FROM hits WHERE UserID = 435090932899640449;""",
    """SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';""",
    """SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;""",
    """SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;""",
    """SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;""",
    """SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;""",
    """SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;""",
    """SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;""",
    """SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;""",
    # Raw string: in a normal Python literal '\1' is the control character
    # \x01 (octal escape) and '\.' is an invalid escape, so the regex
    # back-reference would never reach the SQL engine intact.
    r"""SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;""",
    # Index 29: ninety sums -- SUM(ResolutionWidth), SUM(ResolutionWidth + 1),
    # ..., SUM(ResolutionWidth + 89) -- generated instead of spelled out.
    "SELECT SUM(ResolutionWidth), "
    + ", ".join(f"SUM(ResolutionWidth + {k})" for k in range(1, 90))
    + " FROM hits;",
    """SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;""",
    """SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;""",
    """SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;""",
    """SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;""",
    """SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;""",
    """SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;""",
    """SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;""",
    """SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;""",
    """SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;""",
    """SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;""",
    """SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;""",
    """SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-01' AND toDate(EventDate) <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;""",
    """SELECT DATE_TRUNC('minute', toDateTime(EventTime)) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND toDate(EventDate) >= '2013-07-14' AND toDate(EventDate) <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', toDateTime(EventTime)) ORDER BY DATE_TRUNC('minute', toDateTime(EventTime)) LIMIT 10 OFFSET 1000;""",
]
| 58 | + |
| 59 | + |
def chdb_query(i, output, times=1):
    """Run benchmark query ``i`` with chdb reading the Parquet file directly.

    The ``FROM hits`` placeholder in the query text is swapped for a
    ``file()`` table function over ``data_path`` and the resulting SQL is
    passed to ``execute_query``.

    Args:
        i: Index into ``queries``.
        output: Output format name forwarded to ``execute_query``.
        times: Number of times the query is executed.

    Returns:
        Whatever ``execute_query`` returns for the rewritten SQL.
    """
    parquet_source = f"FROM file('{data_path}', Parquet)"
    rewritten = queries[i].replace("FROM hits", parquet_source)
    return execute_query(i, output, times, rewritten)
| 67 | + |
| 68 | + |
def execute_query(i, output, times, sql):
    """Execute ``sql`` through ``chdb.query`` repeatedly and collect timings.

    Args:
        i: Query number, used only for the ``Q{i}`` label that is printed.
        output: Output format passed as the second argument of ``chdb.query``.
        times: How many times to run the statement.
        sql: The SQL text to execute.

    Returns:
        A ``(wall_times, reported_times)`` tuple of two lists, one entry per
        run, each rounded to two decimals.  ``wall_times`` is measured around
        the ``chdb.query`` call with ``timeit.default_timer``;
        ``reported_times`` is the value of the result object's ``elapsed()``.
    """
    print(f"Q{i}: {sql}")
    wall_times, reported_times = [], []
    for run_no in range(times):
        began = timeit.default_timer()
        result = chdb.query(sql, output)
        finished = timeit.default_timer()
        wall_times.append(round(finished - began, 2))
        reported_times.append(round(result.elapsed(), 2))
        print(f"Times: {run_no}")
    print("FuncTime: ", wall_times)
    print("Elapsed : ", reported_times)
    return wall_times, reported_times
| 86 | + |
| 87 | + |
# DataFrame holding the benchmark data; loaded on the first call to
# chdb_query_pandas and reused afterwards.  The rewritten SQL refers to it by
# this exact name via ``Python(hits)``.
hits = None


def chdb_query_pandas(i, output, times=1):
    """Run benchmark query ``i`` with chdb reading from a pandas DataFrame.

    On first use the Parquet file at ``data_path`` is read into the
    module-level ``hits`` frame, ``EventTime`` / ``EventDate`` are converted
    to datetimes (from seconds and days respectively) and every object-dtype
    column is cast with ``astype(str)``.  The query's ``FROM hits`` is then
    replaced by ``FROM Python(hits)`` and handed to ``execute_query``.

    Args:
        i: Index into ``queries``.
        output: Output format name forwarded to ``execute_query``.
        times: Number of times the query is executed.

    Returns:
        Whatever ``execute_query`` returns for the rewritten SQL.
    """
    global hits
    if hits is None:
        hits = pd.read_parquet(data_path)
        # The time columns are stored as plain numbers; turn them into datetimes.
        for column, unit in (("EventTime", "s"), ("EventDate", "D")):
            hits[column] = pd.to_datetime(hits[column], unit=unit)
        # Cast every object-dtype column to str.
        object_columns = [name for name in hits.columns if hits[name].dtype == "O"]
        for name in object_columns:
            hits[name] = hits[name].astype(str)
    rewritten = queries[i].replace("FROM hits", "FROM Python(hits)")
    return execute_query(i, output, times, rewritten)
| 108 | + |
| 109 | + |
def exec_ch_local(i, log_level="test", output="Null", times=1):
    """Time benchmark query ``i`` using the standalone ClickHouse binary.

    The query's ``FROM hits`` placeholder is replaced by a ``file()`` table
    function over ``data_path`` and the binary at ``ch_local`` is started once
    per run, with a command of the form::

        <ch_local> -q "SELECT ... FROM file('<data_path>', Parquet) ..."
            --log-level=<log_level> --time --output-format=<output>

    Args:
        i: Index into ``queries``.
        log_level: Value for the binary's ``--log-level`` option.
        output: Value for the binary's ``--output-format`` option.
        times: Number of times the command is run.

    Returns:
        A list with one wall-clock duration per run (seconds, rounded to two
        decimals), measured around the whole subprocess call.
    """
    import shlex
    import subprocess

    sql = queries[i].replace("FROM hits", f"FROM file('{data_path}', Parquet)")
    cmd = [
        ch_local,
        "-q",
        sql,
        "--log-level=" + log_level,
        "--time",
        "--output-format=" + output,
    ]
    # Quote each argument so the echoed command can be pasted into a shell
    # as-is (the SQL contains spaces, quotes and parentheses).
    print(" ".join(shlex.quote(arg) for arg in cmd))
    time_list = []
    for t in range(times):
        start = timeit.default_timer()
        completed = subprocess.run(cmd)
        end = timeit.default_timer()
        time_list.append(round(end - start, 2))
        print(f"Times: {t}")
        if completed.returncode != 0:
            # Keep the benchmark going, but make it visible that this timing
            # belongs to a failed run and should not be trusted.
            print(
                f"WARNING: clickhouse exited with code {completed.returncode}; "
                "the recorded time is for a failed run",
                file=sys.stderr,
            )
    print("ExecTime: ", time_list)
    return time_list
| 137 | + |
| 138 | + |
# Timings of the most recent query per backend; ``None`` means that backend
# was not requested on the command line.
chdb_time_list = None
chdb_elapsed_list = None
chdb_pandas_time_list = None
chdb_pandas_elapsed_list = None
exec_time_list = None


def build_parser():
    """Create the command-line parser for the benchmark script.

    ``query`` and ``output`` are optional positionals at the parser level so
    that ``--all`` can be used without dummy values; ``parse_cli`` enforces
    them when a single query is requested.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "query", type=int, nargs="?", help="query index (required unless --all)"
    )
    parser.add_argument(
        "output", type=str, nargs="?", help="output format (required unless --all)"
    )
    parser.add_argument("--all", action="store_true", help="run all queries")
    parser.add_argument("--times", type=int, default=1, help="run times for each query")
    parser.add_argument("--chdb", action="store_true", help="use chdb to run query")
    parser.add_argument("--pandas", action="store_true", help="use pandas to run query")
    parser.add_argument(
        "--local", action="store_true", help="use local clickhouse binary"
    )
    parser.add_argument(
        "--log_level", type=str, default="test", help="log level for local"
    )
    return parser


def parse_cli(argv=None, query_count=None):
    """Parse and validate the command line.

    Args:
        argv: Argument list; ``None`` means ``sys.argv[1:]`` (argparse default).
        query_count: Number of available queries.  When given, a single-query
            index outside ``0 .. query_count - 1`` is rejected.

    Returns:
        The parsed namespace.  With ``--all`` the output format is forced to
        ``"Null"`` and the log level to ``"error"``; an output of ``"Null"``
        in single-query mode also forces the log level to ``"error"``.

    Raises:
        SystemExit: Via ``parser.error`` when ``query``/``output`` are missing
            without ``--all`` or the index is out of range.
    """
    parser = build_parser()
    # Intermixed parsing keeps positionals and options order-independent even
    # though the positionals are optional.
    args = parser.parse_intermixed_args(argv)
    if args.all:
        # Any query/output given together with --all is ignored.
        args.output = "Null"
        args.log_level = "error"
        return args
    if args.query is None or args.output is None:
        parser.error("query and output are required unless --all is given")
    if query_count is not None and not 0 <= args.query < query_count:
        parser.error(f"query index must be between 0 and {query_count - 1}")
    if args.output == "Null":
        args.log_level = "error"
    return args


def run_backends(args):
    """Run ``args.query`` on every backend enabled in ``args``.

    Returns:
        A 5-tuple ``(chdb_time, chdb_elapsed, pandas_time, pandas_elapsed,
        local_time)``; entries of backends that were not enabled are ``None``.
    """
    chdb_times = chdb_elapsed = pandas_times = pandas_elapsed = local_times = None
    if args.chdb:
        chdb_times, chdb_elapsed = chdb_query(args.query, args.output, args.times)
    if args.pandas:
        pandas_times, pandas_elapsed = chdb_query_pandas(
            args.query, args.output, args.times
        )
    if args.local:
        local_times = exec_ch_local(
            args.query, args.log_level, args.output, args.times
        )
    return chdb_times, chdb_elapsed, pandas_times, pandas_elapsed, local_times


if __name__ == "__main__":
    args = parse_cli(query_count=len(queries))
    if args.all:
        # Column labels follow the same backend order as run_backends().
        columns = []
        if args.chdb:
            columns += ["chdb_time", "chdb_elapsed"]
        if args.pandas:
            columns += ["chdb_pd_time", "chdb_pd_elapsed"]
        if args.local:
            columns += ["ch_local_time"]

        all_time_list = []
        for query_index in range(len(queries)):
            args.query = query_index
            (
                chdb_time_list,
                chdb_elapsed_list,
                chdb_pandas_time_list,
                chdb_pandas_elapsed_list,
                exec_time_list,
            ) = run_backends(args)
            results = (
                chdb_time_list,
                chdb_elapsed_list,
                chdb_pandas_time_list,
                chdb_pandas_elapsed_list,
                exec_time_list,
            )
            # Enabled backends always return lists, so dropping the None
            # entries leaves exactly one cell per selected column.
            all_time_list.append([r for r in results if r is not None])

        # One row per query, one column per collected timing list.
        df = pd.DataFrame(all_time_list)
        df.columns = columns
        print("All queries:")
        print(df)
    else:
        (
            chdb_time_list,
            chdb_elapsed_list,
            chdb_pandas_time_list,
            chdb_pandas_elapsed_list,
            exec_time_list,
        ) = run_backends(args)

        # print summary
        print(f"Q{args.query}: {queries[args.query]}")
        print("Summary:")
        print(f"chdb_time_list: {chdb_time_list}, elapsed: {chdb_elapsed_list}")
        print(f"chdb_pd_time_list: {chdb_pandas_time_list}, elapsed: {chdb_pandas_elapsed_list}")
        print(f"local_time_list: {exec_time_list}")
0 commit comments