Merge pull request #18 from auxten/tests

auxten · web-flow · commit 1340b708b850 · 2023-04-22T23:45:52.000+08:00
Run all tests after build
diff --git a/.github/workflows/build_wheels.yml b/.github/workflows/build_wheels.yml
@@ -120,6 +120,13 @@ jobs:
           sudo rm -f dist/*-linux_x86_64.whl
           ls -lh dist
         shell: bash
+      - name: Run tests
+        run: |
+          python3 -m pip install dist/*.whl
+          python3 -m pip install pandas pyarrow
+          python3 -c "import chdb; res = chdb.query('select 1112222222,555', 'CSV'); print(res.get_memview().tobytes())"
+          make test
+        continue-on-error: false
       - uses: actions/upload-artifact@v3
         with:
           path: ./dist/*.whl
@@ -231,10 +238,12 @@ jobs:
       - name: Fix wheel platform tag
         run: |
           python3 -m wheel tags --platform-tag=macosx_10_15_x86_64 --remove dist/*.whl
-      - name: Run simple test
+      - name: Run tests
         run: |
           python3 -m pip install dist/*.whl
+          python3 -m pip install pandas pyarrow
           python3 -c "import chdb; res = chdb.query('select 1112222222,555', 'CSV'); print(res.get_memview().tobytes())"
+          make test
         continue-on-error: false
       - name: Show files
         run: ls -lh dist
@@ -336,7 +345,8 @@ jobs:
           CIBW_BEFORE_BUILD: "pip install -U pip tox pybind11 && bash -x gen_manifest.sh && bash chdb/build.sh"
           CIBW_BUILD_VERBOSITY: 3
           CIBW_BUILD: "cp37-macosx_x86_64 cp38-macosx_x86_64 cp39-macosx_x86_64 cp310-macosx_x86_64"
-          CIBW_TEST_COMMAND: python -c "import chdb; res = chdb.query('select 1112222222,555', 'CSV'); print(res.get_memview().tobytes())"
+          CIBW_TEST_REQUIRES: "pyarrow pandas"
+          CIBW_TEST_COMMAND: "cd {project} && make test"
         # with:
         #   package-dir: .
         #   output-dir: wheelhouse
diff --git a/Makefile b/Makefile
@@ -10,6 +10,10 @@ wheel:
 	tox -e build -- --wheel
 	@echo "Done."
 
+test:
+	@echo "Testing..."
+	cd tests && python3 run_all.py
+
 pub:
 	@echo "Publishing wheel..."
 	tox -e publish
diff --git a/chdb/__init__.py b/chdb/__init__.py
@@ -1,7 +1,7 @@
 import sys
 import os
 
-chdb_version = (0, 5, 0)
+chdb_version = (0, 6, 0)
 if sys.version_info[:2] >= (3, 7):
     # get the path of the current file
     current_path = os.path.dirname(os.path.abspath(__file__))
diff --git a/chdb/__main__.py b/chdb/__main__.py
@@ -21,7 +21,7 @@ def main():
     sql = options.sql[0]
     output_format = options.format
     res = query(sql, output_format)
-    if output_format.lower() == 'dataframe':
+    if output_format.lower() in ('dataframe', 'arrowtable'):
         temp = res
     else:
         temp = res.data()
diff --git a/tests/conftest.py b/tests/conftest.py
diff --git a/tests/format_output.py b/tests/format_output.py
diff --git a/tests/gen_format_cases.py b/tests/gen_format_cases.py
@@ -0,0 +1,42 @@
+#!python3
+
+import os
+import re
+import pprint
+import chdb
+from utils import current_dir, data_file, reset_elapsed
+
+# some formats are not supported on chdb, so we need to skip them
+# TODO: add support for these formats
+#   ["Template", "PrettyJSONEachRow", "Prometheus", "Protobuf", "ProtobufSingle", "Avro", "CapnProto", "MsgPack"]
+formats = ["TabSeparated",  "TabSeparatedRaw", "TabSeparatedWithNames", "TabSeparatedWithNamesAndTypes", "TabSeparatedRawWithNames",
+           "TabSeparatedRawWithNamesAndTypes", "CSV", "CSVWithNames", "CSVWithNamesAndTypes", "CustomSeparated",
+           "CustomSeparatedWithNames", "CustomSeparatedWithNamesAndTypes", "SQLInsert", "Values", "Vertical", "JSON", "JSONStrings",
+           "JSONColumns", "JSONColumnsWithMetadata", "JSONCompact", "JSONCompactStrings", "JSONCompactColumns", "JSONEachRow",
+           "JSONEachRowWithProgress", "JSONStringsEachRow", "JSONStringsEachRowWithProgress", "JSONCompactEachRow",
+           "JSONCompactEachRowWithNames", "JSONCompactEachRowWithNamesAndTypes", "JSONCompactStringsEachRow",
+           "JSONCompactStringsEachRowWithNames", "JSONCompactStringsEachRowWithNamesAndTypes", "JSONObjectEachRow", "BSONEachRow",
+           "TSKV", "Pretty", "PrettyNoEscapes", "PrettyMonoBlock", "PrettyNoEscapesMonoBlock", "PrettyCompact", "PrettyCompactNoEscapes",
+           "PrettyCompactMonoBlock", "PrettyCompactNoEscapesMonoBlock", "PrettySpace", "PrettySpaceNoEscapes", "PrettySpaceMonoBlock",
+           "PrettySpaceNoEscapesMonoBlock", "Parquet", "ArrowTable",
+           "ORC", "RowBinary", "RowBinaryWithNames", "RowBinaryWithNamesAndTypes", "Native", "Null", "XML", "LineAsString",
+           "RawBLOB", "Markdown"]
+
+# Build the expected-output table: one normalized result per listed format.
+
+format_output = {}
+query_sql = "SELECT * FROM file('" + data_file + "', Parquet) limit 10"
+
+for fmt in formats:
+    res = chdb.query(query_sql, fmt)
+    # ArrowTable results are captured via their string form, the rest as raw bytes
+    raw = f"{res}" if fmt == "ArrowTable" else res.get_memview().tobytes()
+    data = reset_elapsed(raw)
+    size = len(data)
+    print("format: " + fmt + " size: " + str(size))
+    format_output[fmt] = {"len": size, "data": data}
+
+# Persist the table as a Python module next to this script so it can be imported.
+target_path = os.path.join(current_dir, "format_output.py")
+with open(target_path, "w") as f:
+    f.write("format_output = " + pprint.pformat(format_output) + "\n")
diff --git a/tests/run_all.py b/tests/run_all.py
@@ -0,0 +1,19 @@
+#!python3
+"""Discover and run every unittest found under the current directory.
+
+Exits with status 0 when the whole suite passes and 1 otherwise, so the
+calling shell or build step can detect a failing run.
+"""
+
+import sys
+import unittest
+
+test_loader = unittest.TestLoader()
+test_suite = test_loader.discover('./')
+
+test_runner = unittest.TextTestRunner()
+ret = test_runner.run(test_suite)
+
+# wasSuccessful() is False on failures, errors and unexpected successes;
+# sys.exit is used because the bare exit() helper is only added by `site`.
+sys.exit(0 if ret.wasSuccessful() else 1)
diff --git a/tests/test_basic.py b/tests/test_basic.py
@@ -0,0 +1,36 @@
+#!python3
+
+import os
+import unittest
+import chdb
+from format_output import format_output
+from utils import data_file, reset_elapsed
+
+
+class TestBasic(unittest.TestCase):
+    """Smoke test: a trivial query yields a 2-byte CSV result."""
+
+    def test_basic(self):
+        res = chdb.query("SELECT 1", "CSV")
+        self.assertEqual(len(res.get_memview().tobytes()), 2)  # "1\n"
+
+
+class TestOutput(unittest.TestCase):
+    """Compare each format's output with the data recorded in format_output."""
+
+    def test_output(self):
+        query = "SELECT * FROM file('" + data_file + "', Parquet) limit 10"
+        for fmt, expected in format_output.items():
+            # subTest names the failing format and keeps checking the rest,
+            # instead of stopping at the first mismatch.
+            with self.subTest(format=fmt):
+                res = chdb.query(query, fmt)
+                if fmt == "ArrowTable":
+                    data = reset_elapsed(f"{res}")
+                else:
+                    data = reset_elapsed(res.get_memview().tobytes())
+                self.assertEqual(data, expected["data"])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tests/test_parallel.py b/tests/test_parallel.py
@@ -1,12 +1,16 @@
-#!/usr/bin/env python3
+#!python3
 import concurrent.futures
 import time
 import sys
-import _chdb as chdb
+import os
+import chdb
+import unittest
 
 # run query parallel in n thread and benchmark
-thread_count = 1
+thread_count = 10
 query_count = 1000
+current_dir=os.path.dirname(os.path.abspath(__file__))
+data_file=os.path.join(current_dir, "../contrib/arrow/cpp/submodules/parquet-testing/data/alltypes_dictionary.parquet")
 
 if len(sys.argv) == 2:
     thread_count = int(sys.argv[1])
@@ -18,7 +22,7 @@
 
 def run_query(query, format):
     res = chdb.query(query, format)
-    print(len(res.get_memview().tobytes()))
+    assert len(res.get_memview().tobytes()) == 2290
 
 def run_queries(query, format, count = query_count):
     for i in range(count):
@@ -37,6 +41,11 @@ def benchmark(query, format, parallel = thread_count, count = query_count):
     wait()
     time_end = time.time()
     print("Time cost:", time_end - time_start, "s")
+    print("QPS:", count / (time_end - time_start))
+
+class TestParallel(unittest.TestCase):
+    def test_parallel(self):
+        benchmark(f"SELECT * FROM file('{data_file}', Parquet) LIMIT 10", "Arrow")
 
 if __name__ == '__main__':
-    benchmark("SELECT * FROM file('/home/Clickhouse/bench/result.parquet', Parquet) LIMIT 10", "Arrow")
+    unittest.main()
diff --git a/tests/test_skeleton.py b/tests/test_skeleton.py
diff --git a/tests/utils.py b/tests/utils.py
@@ -0,0 +1,37 @@
+import re
+import os
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+data_file = os.path.join(
+    current_dir, "../contrib/arrow/cpp/submodules/parquet-testing/data/alltypes_dictionary.parquet")
+
+# reset elapsed time to 0.0 from output, since it will be different each time
+# eg: "elapsed": 0.001015,
+
+
+def reset_elapsed(input):
+    """Strip run-dependent values from query output so results compare equal.
+
+    ``input`` may be ``str`` or ``bytes``. Bytes are decoded first; a payload
+    that cannot be decoded (binary formats) is returned unchanged. In text,
+    JSON ``"elapsed"`` and XML ``<elapsed>`` timings become ``0.0``, any
+    ``tz=...]`` annotation becomes ``tz=Etc/UTC]`` and every ``08:`` becomes
+    ``00:``.
+    """
+    text = input
+    if not isinstance(text, str):
+        try:
+            text = text.decode()
+        except UnicodeDecodeError:
+            # not text: nothing to normalize, hand the original back
+            return input
+    rewrites = (
+        (r'("elapsed": )\d+\.\d+', r'\g<1>0.0'),
+        (r'(<elapsed>)\d+\.\d+(</elapsed>)', r'\g<1>0.0\g<2>'),
+        (r'(tz=).*]', r'\g<1>Etc/UTC]'),
+    )
+    for pattern, replacement in rewrites:
+        text = re.sub(pattern, replacement, text)
+    # NOTE(review): presumably cancels a UTC+8 local-time offset in rendered
+    # timestamps -- confirm; it rewrites every "08:" occurrence.
+    return text.replace('08:', '00:')