|
| 1 | +// Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +// or more contributor license agreements. See the NOTICE file |
| 3 | +// distributed with this work for additional information |
| 4 | +// regarding copyright ownership. The ASF licenses this file |
| 5 | +// to you under the Apache License, Version 2.0 (the |
| 6 | +// "License"); you may not use this file except in compliance |
| 7 | +// with the License. You may obtain a copy of the License at |
| 8 | +// |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// |
| 11 | +// Unless required by applicable law or agreed to in writing, |
| 12 | +// software distributed under the License is distributed on an |
| 13 | +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +// KIND, either express or implied. See the License for the |
| 15 | +// specific language governing permissions and limitations |
| 16 | +// under the License. |
| 17 | + |
| 18 | +use arrow::array::{ArrayRef, StringArray}; |
| 19 | +use arrow::util::test_util::seedable_rng; |
| 20 | +use criterion::{criterion_group, criterion_main, Criterion}; |
| 21 | +use parquet_variant::{Variant, VariantBuilder}; |
| 22 | +use parquet_variant_compute::variant_get::{variant_get, GetOptions}; |
| 23 | +use parquet_variant_compute::{batch_json_string_to_variant, VariantArray, VariantArrayBuilder}; |
| 24 | +use rand::distr::Alphanumeric; |
| 25 | +use rand::rngs::StdRng; |
| 26 | +use rand::Rng; |
| 27 | +use rand::SeedableRng; |
| 28 | +use std::fmt::Write; |
| 29 | +use std::sync::Arc; |
| 30 | + |
| 31 | +/// This function generates a vector of JSON strings, each representing a person |
| 32 | +/// with random first name, last name, and age. |
| 33 | +/// |
| 34 | +/// Example: |
| 35 | +/// ```json |
| 36 | +/// { |
| 37 | +/// "first" : random_string_of_1_to_20_characters, |
| 38 | +/// "last" : random_string_of_1_to_20_characters, |
| 39 | +/// "age": random_value_between_20_and_80, |
| 40 | +/// } |
| 41 | +/// ``` |
| 42 | +fn small_repeated_json_structure(count: usize) -> impl Iterator<Item = String> { |
| 43 | + let mut rng = seedable_rng(); |
| 44 | + (0..count).map(move |_| { |
| 45 | + let first: String = (0..rng.random_range(1..=20)) |
| 46 | + .map(|_| rng.sample(Alphanumeric) as char) |
| 47 | + .collect(); |
| 48 | + let last: String = (0..rng.random_range(1..=20)) |
| 49 | + .map(|_| rng.sample(Alphanumeric) as char) |
| 50 | + .collect(); |
| 51 | + let age: u8 = rng.random_range(20..=80); |
| 52 | + format!("{{\"first\":\"{first}\",\"last\":\"{last}\",\"age\":{age}}}") |
| 53 | + }) |
| 54 | +} |
| 55 | + |
| 56 | +/// This function generates a vector of JSON strings which have many fields |
| 57 | +/// and a random structure (including field names) |
| 58 | +fn small_random_json_structure(count: usize) -> impl Iterator<Item = String> { |
| 59 | + let mut generator = RandomJsonGenerator::new(); |
| 60 | + (0..count).map(move |_| generator.next().to_string()) |
| 61 | +} |
| 62 | + |
| 63 | +fn benchmark_batch_json_string_to_variant(c: &mut Criterion) { |
| 64 | + let input_array = StringArray::from_iter_values(small_repeated_json_structure(8000)); |
| 65 | + let array_ref: ArrayRef = Arc::new(input_array); |
| 66 | + c.bench_function( |
| 67 | + "batch_json_string_to_variant small_repeated_json 8k string", |
| 68 | + |b| { |
| 69 | + b.iter(|| { |
| 70 | + let _ = batch_json_string_to_variant(&array_ref).unwrap(); |
| 71 | + }); |
| 72 | + }, |
| 73 | + ); |
| 74 | + |
| 75 | + let input_array = StringArray::from_iter_values(small_random_json_structure(8000)); |
| 76 | + let array_ref: ArrayRef = Arc::new(input_array); |
| 77 | + c.bench_function( |
| 78 | + "batch_json_string_to_variant small_random_json 8k string", |
| 79 | + |b| { |
| 80 | + b.iter(|| { |
| 81 | + let _ = batch_json_string_to_variant(&array_ref).unwrap(); |
| 82 | + }); |
| 83 | + }, |
| 84 | + ); |
| 85 | +} |
| 86 | + |
| 87 | +fn create_primitive_variant(size: usize) -> VariantArray { |
| 88 | + let mut rng = StdRng::seed_from_u64(42); |
| 89 | + |
| 90 | + let mut variant_builder = VariantArrayBuilder::new(1); |
| 91 | + |
| 92 | + for _ in 0..size { |
| 93 | + let mut builder = VariantBuilder::new(); |
| 94 | + builder.append_value(rng.random::<i64>()); |
| 95 | + let (metadata, value) = builder.finish(); |
| 96 | + variant_builder.append_variant(Variant::try_new(&metadata, &value).unwrap()); |
| 97 | + } |
| 98 | + |
| 99 | + variant_builder.build() |
| 100 | +} |
| 101 | + |
| 102 | +pub fn variant_get_bench(c: &mut Criterion) { |
| 103 | + let variant_array = create_primitive_variant(8192); |
| 104 | + let input: ArrayRef = Arc::new(variant_array); |
| 105 | + |
| 106 | + let options = GetOptions { |
| 107 | + path: vec![].into(), |
| 108 | + as_type: None, |
| 109 | + cast_options: Default::default(), |
| 110 | + }; |
| 111 | + |
| 112 | + c.bench_function("variant_get_primitive", |b| { |
| 113 | + b.iter(|| variant_get(&input.clone(), options.clone())) |
| 114 | + }); |
| 115 | +} |
| 116 | + |
| 117 | +criterion_group!( |
| 118 | + benches, |
| 119 | + variant_get_bench, |
| 120 | + benchmark_batch_json_string_to_variant |
| 121 | +); |
| 122 | +criterion_main!(benches); |
| 123 | + |
/// Creates JSON with random structure and fields.
///
/// Each value type is generated in a random proportion, controlled by the
/// probabilities below. The listed probabilities should sum to at most 1.0;
/// whatever remains is the probability of generating a JSON array.
struct RandomJsonGenerator {
    /// Random number generator
    rng: StdRng,
    /// the probability of generating a null value
    null_probability: f64,
    /// the probability of generating a string value
    string_probability: f64,
    /// the probability of generating a number value
    number_probability: f64,
    /// the probability of generating a boolean value
    boolean_probability: f64,
    /// the probability of generating an object value
    object_probability: f64,
    // probability of generating a JSON array is the remaining probability
    /// The maximum depth of the generated JSON structure; values at this
    /// depth are replaced by a fixed placeholder string
    max_depth: usize,
    /// output buffer, cleared and refilled for every generated document
    output_buffer: String,
}
| 147 | + |
| 148 | +impl RandomJsonGenerator { |
| 149 | + fn new() -> Self { |
| 150 | + let rng = seedable_rng(); |
| 151 | + Self { |
| 152 | + rng, |
| 153 | + null_probability: 0.05, |
| 154 | + string_probability: 0.25, |
| 155 | + number_probability: 0.25, |
| 156 | + boolean_probability: 0.10, |
| 157 | + object_probability: 0.10, |
| 158 | + max_depth: 5, |
| 159 | + output_buffer: String::new(), |
| 160 | + } |
| 161 | + } |
| 162 | + |
| 163 | + fn next(&mut self) -> &str { |
| 164 | + self.output_buffer.clear(); |
| 165 | + self.append_random_json(0); |
| 166 | + &self.output_buffer |
| 167 | + } |
| 168 | + |
| 169 | + /// Appends a random JSON value to the output buffer. |
| 170 | + fn append_random_json(&mut self, current_depth: usize) { |
| 171 | + if current_depth >= self.max_depth { |
| 172 | + write!(&mut self.output_buffer, "\"max_depth reached\"").unwrap(); |
| 173 | + return; |
| 174 | + } |
| 175 | + // Generate a random number to determine the type |
| 176 | + let random_value: f64 = self.rng.random(); |
| 177 | + if random_value < self.null_probability { |
| 178 | + write!(&mut self.output_buffer, "null").unwrap(); |
| 179 | + } else if random_value < self.null_probability + self.string_probability { |
| 180 | + // Generate a random string between 1 and 500 characters |
| 181 | + let length = self.rng.random_range(1..=500); |
| 182 | + let random_string: String = (0..length) |
| 183 | + .map(|_| self.rng.sample(Alphanumeric) as char) |
| 184 | + .collect(); |
| 185 | + write!(&mut self.output_buffer, "\"{random_string}\"",).unwrap(); |
| 186 | + } else if random_value |
| 187 | + < self.null_probability + self.string_probability + self.number_probability |
| 188 | + { |
| 189 | + // Generate a random number |
| 190 | + let random_number: f64 = self.rng.random_range(-1000.0..1000.0); |
| 191 | + write!(&mut self.output_buffer, "{random_number}",).unwrap(); |
| 192 | + } else if random_value |
| 193 | + < self.null_probability |
| 194 | + + self.string_probability |
| 195 | + + self.number_probability |
| 196 | + + self.boolean_probability |
| 197 | + { |
| 198 | + // Generate a random boolean |
| 199 | + let random_boolean: bool = self.rng.random(); |
| 200 | + write!(&mut self.output_buffer, "{random_boolean}",).unwrap(); |
| 201 | + } else if random_value |
| 202 | + < self.null_probability |
| 203 | + + self.string_probability |
| 204 | + + self.number_probability |
| 205 | + + self.boolean_probability |
| 206 | + + self.object_probability |
| 207 | + { |
| 208 | + // Generate a random object |
| 209 | + let num_fields = self.rng.random_range(1..=10); |
| 210 | + |
| 211 | + write!(&mut self.output_buffer, "{{").unwrap(); |
| 212 | + for i in 0..num_fields { |
| 213 | + let key_length = self.rng.random_range(1..=20); |
| 214 | + let key: String = (0..key_length) |
| 215 | + .map(|_| self.rng.sample(Alphanumeric) as char) |
| 216 | + .collect(); |
| 217 | + write!(&mut self.output_buffer, "\"{key}\":").unwrap(); |
| 218 | + self.append_random_json(current_depth + 1); |
| 219 | + if i < num_fields - 1 { |
| 220 | + write!(&mut self.output_buffer, ",").unwrap(); |
| 221 | + } |
| 222 | + } |
| 223 | + write!(&mut self.output_buffer, "}}").unwrap(); |
| 224 | + } else { |
| 225 | + // Generate a random array |
| 226 | + let length = self.rng.random_range(1..=10); |
| 227 | + write!(&mut self.output_buffer, "[").unwrap(); |
| 228 | + for i in 0..length { |
| 229 | + self.append_random_json(current_depth + 1); |
| 230 | + if i < length - 1 { |
| 231 | + write!(&mut self.output_buffer, ",").unwrap(); |
| 232 | + } |
| 233 | + } |
| 234 | + write!(&mut self.output_buffer, "]").unwrap(); |
| 235 | + } |
| 236 | + } |
| 237 | +} |
0 commit comments