Skip to content

Commit dd68354

Browse files
authored
Merge pull request #335 from wudidapaopao/add-readme
Add README documentation for JSON type support and new interface
2 parents ffc395f + 5f3d06f commit dd68354

File tree

1 file changed

+83
-10
lines changed

1 file changed

+83
-10
lines changed

README.md

Lines changed: 83 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,11 @@
1818

1919

2020
> chDB is an in-process SQL OLAP Engine powered by ClickHouse [^1]
21-
> For more details: [The birth of chDB](https://auxten.com/the-birth-of-chdb/)
21+
> For more details: [The birth of chDB](https://auxten.com/the-birth-of-chdb/)
2222
2323

2424
## Features
25-
25+
2626
* In-process SQL OLAP Engine, powered by ClickHouse
2727
* No need to install ClickHouse
2828
* Minimized data copy from C++ to Python with [python memoryview](https://docs.python.org/3/c-api/memoryview.html)
@@ -113,7 +113,7 @@ res = chdb.query('select version()', 'Pretty'); print(res)
113113
# See more data type format in tests/format_output.py
114114
res = chdb.query('select * from file("data.parquet", Parquet)', 'JSON'); print(res)
115115
res = chdb.query('select * from file("data.csv", CSV)', 'CSV'); print(res)
116-
print(f"SQL read {res.rows_read()} rows, {res.bytes_read()} bytes, elapsed {res.elapsed()} seconds")
116+
print(f"SQL read {res.rows_read()} rows, {res.bytes_read()} bytes, storage read {res.storage_rows_read()} rows, {res.storage_bytes_read()} bytes, elapsed {res.elapsed()} seconds")
117117
```
118118

119119
### Pandas dataframe output
@@ -138,6 +138,8 @@ ret_tbl = cdf.query(sql="select * from __tbl1__ t1 join __tbl2__ t2 on t1.a = t2
138138
print(ret_tbl)
139139
# Query on the DataFrame Table
140140
print(ret_tbl.query('select b, sum(a) from __table__ group by b'))
141+
# Pandas DataFrames are automatically registered as temporary tables in ClickHouse
142+
chdb.query("SELECT * FROM Python(df1) t1 JOIN Python(df2) t2 ON t1.a = t2.c").show()
141143
```
142144
</details>
143145

@@ -285,10 +287,19 @@ df = pd.DataFrame(
285287
{
286288
"a": [1, 2, 3, 4, 5, 6],
287289
"b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
290+
"dict_col": [
291+
{'id': 1, 'tags': ['urgent', 'important'], 'metadata': {'created': '2024-01-01'}},
292+
{'id': 2, 'tags': ['normal'], 'metadata': {'created': '2024-02-01'}},
293+
{'id': 3, 'name': 'tom'},
294+
{'id': 4, 'value': '100'},
295+
{'id': 5, 'value': 101},
296+
{'id': 6, 'value': 102},
297+
],
288298
}
289299
)
290300

291301
chdb.query("SELECT b, sum(a) FROM Python(df) GROUP BY b ORDER BY b").show()
302+
chdb.query("SELECT dict_col.id FROM Python(df) WHERE dict_col.value='100'").show()
292303
```
293304

294305
### Query on Arrow Table
@@ -300,12 +311,19 @@ arrow_table = pa.table(
300311
{
301312
"a": [1, 2, 3, 4, 5, 6],
302313
"b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
314+
"dict_col": [
315+
{'id': 1, 'value': 'tom'},
316+
{'id': 2, 'value': 'jerry'},
317+
{'id': 3, 'value': 'auxten'},
318+
{'id': 4, 'value': 'tom'},
319+
{'id': 5, 'value': 'jerry'},
320+
{'id': 6, 'value': 'auxten'},
321+
],
303322
}
304323
)
305324

306-
chdb.query(
307-
"SELECT b, sum(a) FROM Python(arrow_table) GROUP BY b ORDER BY b", "debug"
308-
).show()
325+
chdb.query("SELECT b, sum(a) FROM Python(arrow_table) GROUP BY b ORDER BY b").show()
326+
chdb.query("SELECT dict_col.id FROM Python(arrow_table) WHERE dict_col.value='tom'").show()
309327
```
310328

311329
### Query on chdb.PyReader class instance
@@ -329,24 +347,79 @@ class myReader(chdb.PyReader):
329347
def read(self, col_names, count):
330348
print("Python func read", col_names, count, self.cursor)
331349
if self.cursor >= len(self.data["a"]):
350+
self.cursor = 0
332351
return []
333352
block = [self.data[col] for col in col_names]
334353
self.cursor += len(block[0])
335354
return block
336355

356+
def get_schema(self):
357+
return [
358+
("a", "int"),
359+
("b", "str"),
360+
("dict_col", "json")
361+
]
362+
337363
reader = myReader(
338364
{
339365
"a": [1, 2, 3, 4, 5, 6],
340366
"b": ["tom", "jerry", "auxten", "tom", "jerry", "auxten"],
367+
"dict_col": [
368+
{'id': 1, 'tags': ['urgent', 'important'], 'metadata': {'created': '2024-01-01'}},
369+
{'id': 2, 'tags': ['normal'], 'metadata': {'created': '2024-02-01'}},
370+
{'id': 3, 'name': 'tom'},
371+
{'id': 4, 'value': '100'},
372+
{'id': 5, 'value': 101},
373+
{'id': 6, 'value': 102}
374+
],
341375
}
342376
)
343377

344-
chdb.query(
345-
"SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b"
346-
).show()
378+
chdb.query("SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b").show()
379+
chdb.query("SELECT dict_col.id FROM Python(reader) WHERE dict_col.value='100'").show()
347380
```
348381

349-
see also: [test_query_py.py](tests/test_query_py.py).
382+
see also: [test_query_py.py](tests/test_query_py.py) and [test_query_json.py](tests/test_query_json.py).
383+
384+
### JSON Type Inference
385+
386+
chDB automatically converts Python dictionary objects to ClickHouse JSON types from these sources:
387+
388+
1. **Pandas DataFrame**
389+
- Columns with `object` dtype are sampled (default 10,000 rows) to detect JSON structures.
390+
- Control sampling via SQL settings:
391+
```sql
392+
SET pandas_analyze_sample = 10000 -- Default sampling
393+
SET pandas_analyze_sample = 0 -- Force String type
394+
SET pandas_analyze_sample = -1 -- Force JSON type
395+
```
396+
- Columns are converted to `String` if sampling finds non-dictionary values.
397+
398+
2. **Arrow Table**
399+
- `struct` type columns are automatically mapped to JSON columns.
400+
- Nested structures preserve type information.
401+
402+
3. **chdb.PyReader**
403+
- Implement custom schema mapping in `get_schema()`:
404+
```python
405+
def get_schema(self):
406+
return [
407+
("c1", "JSON"), # Explicit JSON mapping
408+
("c2", "String")
409+
]
410+
```
411+
- Column types declared as "JSON" will bypass auto-detection.
412+
413+
When converting Python dictionary objects to JSON columns:
414+
415+
1. **Nested Structures**
416+
- Recursively process nested dictionaries, lists, tuples and NumPy arrays.
417+
418+
2. **Primitive Types**
419+
- Automatic type recognition for basic types such as integers, floats, strings, and booleans.
420+
421+
3. **Complex Objects**
422+
- Non-primitive types will be converted to strings.
350423

351424
### Limitations
352425

0 commit comments

Comments
 (0)