13
13
14
14
Reduce dataset to selected columns, optionally save it under a different name.
15
15
"""
16
- function dselect (
17
- dInfo:: Dinfo ,
18
- columns:: Vector{Int} ,
19
- tgt:: Symbol = dInfo. val,
20
- ):: Dinfo
16
+ function dselect (dInfo:: Dinfo , columns:: Vector{Int} , tgt:: Symbol = dInfo. val):: Dinfo
21
17
dtransform (dInfo, mtx -> mtx[:, columns], tgt)
22
18
end
23
19
91
87
Compute mean and standard deviation of the columns in dataset. Returns a tuple
92
88
with a vector of means in `columns`, and a vector of corresponding sdevs.
93
89
"""
94
- function dstat (
95
- dInfo:: Dinfo ,
96
- columns:: Vector{Int} ,
97
- ):: Tuple{Vector{Float64},Vector{Float64}}
90
+ function dstat (dInfo:: Dinfo , columns:: Vector{Int} ):: Tuple{Vector{Float64},Vector{Float64}}
98
91
99
92
sum_squares = x -> sum (x .^ 2 )
100
93
@@ -136,8 +129,7 @@ function dstat_buckets(
136
129
)
137
130
138
131
# extract the bucketed stats
139
- (sums, sqsums, ns) =
140
- dmapreduce ([dInfo, buckets], get_bucketed_stats, combine_stats)
132
+ (sums, sqsums, ns) = dmapreduce ([dInfo, buckets], get_bucketed_stats, combine_stats)
141
133
142
134
return (
143
135
sums ./ ns, # means
@@ -285,7 +277,8 @@ less or higher than `targets`.
285
277
"""
286
278
function update_extrema (counts, targets, lims, mids)
287
279
broadcast (
288
- (cnt, target, lim, mid) -> cnt >= target ? # if the count is too high,
280
+ (cnt, target, lim, mid) ->
281
+ cnt >= target ? # if the count is too high,
289
282
(lim[1 ], mid) : # median is going to be in the lower half
290
283
(mid, lim[2 ]), # otherwise in the higher half
291
284
counts,
@@ -313,11 +306,8 @@ function dmedian(dInfo::Dinfo, columns::Vector{Int}; iters = 20)
313
306
target = dmapreduce (dInfo, d -> size (d, 1 ), + ) ./ 2
314
307
315
308
# current estimation range for the median (tuples of min, max)
316
- lims = dmapreduce (
317
- dInfo,
318
- d -> mapslices (extrema, d[:, columns], dims = 1 ),
319
- reduce_extrema,
320
- )
309
+ lims =
310
+ dmapreduce (dInfo, d -> mapslices (extrema, d[:, columns], dims = 1 ), reduce_extrema)
321
311
322
312
# convert the limits to a simple vector
323
313
lims = cat (lims... , dims = 1 )
@@ -368,8 +358,8 @@ function dmedian_buckets(
368
358
get_bucket_extrema =
369
359
(d, b) -> catmapbuckets (
370
360
(_, x) -> length (x) > 0 ? # if there are some elements
371
- extrema (x) : # just take the extrema
372
- (Inf , - Inf ), # if not, use backup values
361
+ extrema (x) : # just take the extrema
362
+ (Inf , - Inf ), # if not, use backup values
373
363
d[:, columns],
374
364
nbuckets,
375
365
b,
@@ -384,21 +374,22 @@ function dmedian_buckets(
384
374
# this counts the elements smaller than mids in buckets
385
375
# (both mids and elements are bucketed and column-sliced into matrices)
386
376
bucketed_count_smaller_than_mids =
387
- (d, b) -> vcat (mapbuckets (
388
- (bucketID, d) ->
389
- [
390
- count (x -> x < mids[bucketID, colID], d[:, colID])
391
- for (colID, c) in enumerate (columns)
392
- ]' ,
393
- d,
394
- nbuckets,
395
- b,
396
- slicedims = (1 , 2 ),
397
- )... )
377
+ (d, b) -> vcat (
378
+ mapbuckets (
379
+ (bucketID, d) ->
380
+ [
381
+ count (x -> x < mids[bucketID, colID], d[:, colID]) for
382
+ (colID, c) in enumerate (columns)
383
+ ]' ,
384
+ d,
385
+ nbuckets,
386
+ b,
387
+ slicedims = (1 , 2 ),
388
+ )... ,
389
+ )
398
390
399
391
# gather the counts
400
- counts =
401
- dmapreduce ([dInfo, buckets], bucketed_count_smaller_than_mids, + )
392
+ counts = dmapreduce ([dInfo, buckets], bucketed_count_smaller_than_mids, + )
402
393
403
394
lims = update_extrema (counts, targets, lims, mids)
404
395
end
0 commit comments