1818
1919
2020> chDB is an in-process SQL OLAP Engine powered by ClickHouse [^1]
21- > For more details: [ The birth of chDB] ( https://auxten.com/the-birth-of-chdb/ )
21+ > For more details: [ The birth of chDB] ( https://auxten.com/the-birth-of-chdb/ )
2222
2323
2424## Features
25-
25+
2626* In-process SQL OLAP Engine, powered by ClickHouse
2727* No need to install ClickHouse
2828* Minimized data copy from C++ to Python with [ python memoryview] ( https://docs.python.org/3/c-api/memoryview.html )
@@ -113,7 +113,7 @@ res = chdb.query('select version()', 'Pretty'); print(res)
113113# See more data type format in tests/format_output.py
114114res = chdb.query(' select * from file("data.parquet", Parquet)' , ' JSON' ); print (res)
115115res = chdb.query(' select * from file("data.csv", CSV)' , ' CSV' ); print (res)
116- print (f " SQL read { res.rows_read()} rows, { res.bytes_read()} bytes, elapsed { res.elapsed()} seconds " )
116+ print (f " SQL read { res.rows_read()} rows, { res.bytes_read()} bytes, storage read { res.storage_rows_read() } rows, { res.storage_bytes_read() } bytes, elapsed { res.elapsed()} seconds " )
117117```
118118
119119### Pandas dataframe output
@@ -138,6 +138,8 @@ ret_tbl = cdf.query(sql="select * from __tbl1__ t1 join __tbl2__ t2 on t1.a = t2
138138print (ret_tbl)
139139# Query on the DataFrame Table
140140print (ret_tbl.query(' select b, sum(a) from __table__ group by b' ))
141+ # Pandas DataFrames are automatically registered as temporary tables in ClickHouse
142+ chdb.query(" SELECT * FROM Python(df1) t1 JOIN Python(df2) t2 ON t1.a = t2.c" ).show()
141143```
142144</details >
143145
@@ -285,10 +287,19 @@ df = pd.DataFrame(
285287 {
286288 " a" : [1 , 2 , 3 , 4 , 5 , 6 ],
287289 " b" : [" tom" , " jerry" , " auxten" , " tom" , " jerry" , " auxten" ],
290+ " dict_col" : [
291+ {' id' : 1 , ' tags' : [' urgent' , ' important' ], ' metadata' : {' created' : ' 2024-01-01' }},
292+ {' id' : 2 , ' tags' : [' normal' ], ' metadata' : {' created' : ' 2024-02-01' }},
293+ {' id' : 3 , ' name' : ' tom' },
294+ {' id' : 4 , ' value' : ' 100' },
295+ {' id' : 5 , ' value' : 101 },
296+ {' id' : 6 , ' value' : 102 },
297+ ],
288298 }
289299)
290300
291301chdb.query(" SELECT b, sum(a) FROM Python(df) GROUP BY b ORDER BY b" ).show()
302+ chdb.query(" SELECT dict_col.id FROM Python(df) WHERE dict_col.value='100'" ).show()
292303```
293304
294305### Query on Arrow Table
@@ -300,12 +311,19 @@ arrow_table = pa.table(
300311 {
301312 " a" : [1 , 2 , 3 , 4 , 5 , 6 ],
302313 " b" : [" tom" , " jerry" , " auxten" , " tom" , " jerry" , " auxten" ],
314+ " dict_col" : [
315+ {' id' : 1 , ' value' : ' tom' },
316+ {' id' : 2 , ' value' : ' jerry' },
317+ {' id' : 3 , ' value' : ' auxten' },
318+ {' id' : 4 , ' value' : ' tom' },
319+ {' id' : 5 , ' value' : ' jerry' },
320+ {' id' : 6 , ' value' : ' auxten' },
321+ ],
303322 }
304323)
305324
306- chdb.query(
307- " SELECT b, sum(a) FROM Python(arrow_table) GROUP BY b ORDER BY b" , " debug"
308- ).show()
325+ chdb.query(" SELECT b, sum(a) FROM Python(arrow_table) GROUP BY b ORDER BY b" ).show()
326+ chdb.query(" SELECT dict_col.id FROM Python(arrow_table) WHERE dict_col.value='tom'" ).show()
309327```
310328
311329### Query on chdb.PyReader class instance
@@ -329,24 +347,79 @@ class myReader(chdb.PyReader):
329347 def read (self , col_names , count ):
330348 print (" Python func read" , col_names, count, self .cursor)
331349 if self .cursor >= len (self .data[" a" ]):
350+ self .cursor = 0
332351 return []
333352 block = [self .data[col] for col in col_names]
334353 self .cursor += len (block[0 ])
335354 return block
336355
356+ def get_schema (self ):
357+ return [
358+ (" a" , " int" ),
359+ (" b" , " str" ),
360+ (" dict_col" , " json" )
361+ ]
362+
337363reader = myReader(
338364 {
339365 " a" : [1 , 2 , 3 , 4 , 5 , 6 ],
340366 " b" : [" tom" , " jerry" , " auxten" , " tom" , " jerry" , " auxten" ],
367+ " dict_col" : [
368+ {' id' : 1 , ' tags' : [' urgent' , ' important' ], ' metadata' : {' created' : ' 2024-01-01' }},
369+ {' id' : 2 , ' tags' : [' normal' ], ' metadata' : {' created' : ' 2024-02-01' }},
370+ {' id' : 3 , ' name' : ' tom' },
371+ {' id' : 4 , ' value' : ' 100' },
372+ {' id' : 5 , ' value' : 101 },
373+ {' id' : 6 , ' value' : 102 }
374+ ],
341375 }
342376)
343377
344- chdb.query(
345- " SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b"
346- ).show()
378+ chdb.query(" SELECT b, sum(a) FROM Python(reader) GROUP BY b ORDER BY b" ).show()
379+ chdb.query(" SELECT dict_col.id FROM Python(reader) WHERE dict_col.value='100'" ).show()
347380```
348381
349- see also: [ test_query_py.py] ( tests/test_query_py.py ) .
382+ See also: [test_query_py.py](tests/test_query_py.py) and [test_query_json.py](tests/test_query_json.py).
383+
384+ ### JSON Type Inference
385+
386+ chDB automatically converts Python dictionary objects from the following sources to the ClickHouse JSON type:
387+
388+ 1 . ** Pandas DataFrame**
389+ - Columns with `object` dtype are sampled (default: 10,000 rows) to detect JSON structures.
390+ - Control sampling via SQL settings:
391+ ``` sql
392+ SET pandas_analyze_sample = 10000 -- Default sampling
393+ SET pandas_analyze_sample = 0 -- Force String type
394+ SET pandas_analyze_sample = - 1 -- Force JSON type
395+ ```
396+ - Columns are converted to `String` if sampling finds non-dictionary values.
397+
398+ 2 . ** Arrow Table**
399+ - ` struct` type columns are automatically mapped to JSON columns.
400+ - Nested structures preserve type information.
401+
402+ 3. **chdb.PyReader**
403+ - Implement custom schema mapping in ` get_schema()` :
404+ ` ` ` python
405+ def get_schema(self):
406+ return [
407+ ("c1", "JSON"), # Explicit JSON mapping
408+ ("c2", "String")
409+ ]
410+ ` ` `
411+ - Column types declared as "JSON" bypass auto-detection.
412+
413+ When converting Python dictionary objects to JSON columns:
414+
415+ 1 . ** Nested Structures**
416+ - Nested dictionaries, lists, tuples, and NumPy arrays are processed recursively.
417+
418+ 2 . ** Primitive Types**
419+ - Automatic type recognition for basic types such as integers, floats, strings, booleans, and more.
420+
421+ 3 . ** Complex Objects**
422+ - Non-primitive types are converted to strings.
350423
351424### Limitations
352425
0 commit comments