diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py
index 3d16cc8..6b3e941 100755
--- a/data_hacks/histogram.py
+++ b/data_hacks/histogram.py
@@ -34,6 +34,7 @@
 
 class MVSD(object):
     "A class that calculates a running Mean / Variance / Standard Deviation"
+
     def __init__(self):
         self.is_started = False
         self.ss = Decimal(0)  # (running) sum of square deviations from mean
@@ -51,8 +52,7 @@ def add(self, x, w=1):
             self.is_started = True
         else:
             temp_w = self.total_w + w
-            self.ss += (self.total_w * w * (x - self.m) *
-                        (x - self.m)) / temp_w
+            self.ss += (self.total_w * w * (x - self.m) * (x - self.m)) / temp_w
             self.m += (x - self.m) / temp_w
             self.total_w = temp_w
 
@@ -65,7 +65,8 @@ def sd(self):
     def mean(self):
         return self.m
 
-DataPoint = namedtuple('DataPoint', ['value', 'count'])
+
+DataPoint = namedtuple("DataPoint", ["value", "count"])
 
 
 def test_mvsd():
@@ -73,9 +74,9 @@ def test_mvsd():
     for x in range(10):
         mvsd.add(x)
 
-    assert '%.2f' % mvsd.mean() == "4.50"
-    assert '%.2f' % mvsd.var() == "8.25"
-    assert '%.14f' % mvsd.sd() == "2.87228132326901"
+    assert "%.2f" % mvsd.mean() == "4.50"
+    assert "%.2f" % mvsd.var() == "8.25"
+    assert "%.14f" % mvsd.sd() == "2.87228132326901"
 
 
 def load_stream(input_stream, agg_value_key, agg_key_value):
@@ -96,8 +97,8 @@ def load_stream(input_stream, agg_value_key, agg_key_value):
             else:
                 yield DataPoint(Decimal(clean_line), 1)
         except:
-            logging.exception('failed %r', line)
-            print >>sys.stderr, "invalid line %r" % line
+            logging.exception("failed %r", line)
+            print("invalid line %r" % line, file=sys.stderr)
 
 
 def median(values, key=None):
@@ -105,13 +106,12 @@ def median(values, key=None):
         key = None  # map and sort accept None as identity
     length = len(values)
     if length % 2:
-        median_indeces = [length/2]
+        median_indeces = [length // 2]  # floor division, as in Python 2
     else:
-        median_indeces = [length/2-1, length/2]
+        median_indeces = [length // 2 - 1, length // 2]
 
     values = sorted(values, key=key)
-    return sum(map(key,
-                   [values[i] for i in median_indeces])) / len(median_indeces)
+    return sum(key(values[i]) if key else values[i] for i in median_indeces) / len(median_indeces)
 
 
 def test_median():
@@ -147,7 +147,7 @@ def histogram(stream, options):
         max_v = max_v.value
 
     if not max_v > min_v:
-        raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v))
+        raise ValueError("max must be > min. max:%s min:%s" % (max_v, min_v))
     diff = max_v - min_v
 
     boundaries = []
@@ -155,7 +155,7 @@ def histogram(stream, options):
     buckets = 0
 
     if options.custbuckets:
-        bound = options.custbuckets.split(',')
+        bound = options.custbuckets.split(",")
         bound_sort = sorted(map(Decimal, bound))
 
         # if the last value is smaller than the maximum, replace it
@@ -177,7 +177,7 @@ def histogram(stream, options):
     elif options.logscale:
         buckets = options.buckets and int(options.buckets) or 10
         if buckets <= 0:
-            raise ValueError('# of buckets must be > 0')
+            raise ValueError("# of buckets must be > 0")
 
         def first_bucket_size(k, n):
             """Logarithmic buckets means, the size of bucket i+1 is twice
@@ -189,22 +189,23 @@ def first_bucket_size(k, n):
                 x * (2^{k+1} - 1) = n
                 x = n/(2^{k+1} - 1)
             """
-            return n/(2**(k+1)-1)
+            return n / (2 ** (k + 1) - 1)
 
         def log_steps(k, n):
             "k logarithmic steps whose sum is n"
-            x = first_bucket_size(k-1, n)
+            x = first_bucket_size(k - 1, n)
             sum = 0
             for i in range(k):
                 sum += 2**i * x
                 yield sum
+
         bucket_counts = [0 for x in range(buckets)]
         for step in log_steps(buckets, diff):
             boundaries.append(min_v + step)
     else:
         buckets = options.buckets and int(options.buckets) or 10
         if buckets <= 0:
-            raise ValueError('# of buckets must be > 0')
+            raise ValueError("# of buckets must be > 0")
         step = diff / buckets
         bucket_counts = [0 for x in range(buckets)]
         for x in range(buckets):
@@ -232,69 +233,110 @@ def log_steps(k, n):
     if max(bucket_counts) > 75:
         bucket_scale = int(max(bucket_counts) / 75)
 
-    print("# NumSamples = %d; Min = %0.2f; Max = %0.2f" %
-          (samples, min_v, max_v))
+    print("# NumSamples = %d; Min = %0.2f; Max = %0.2f" % (samples, min_v, max_v))
     if skipped:
-        print("# %d value%s outside of min/max" %
-              (skipped, skipped > 1 and 's' or ''))
+        print("# %d value%s outside of min/max" % (skipped, skipped > 1 and "s" or ""))
     if options.mvsd:
-        print("# Mean = %f; Variance = %f; SD = %f; Median %f" %
-              (mvsd.mean(), mvsd.var(), mvsd.sd(),
-               median(accepted_data, key=lambda x: x.value)))
-    print "# each " + options.dot + " represents a count of %d" % bucket_scale
+        print(
+            "# Mean = %f; Variance = %f; SD = %f; Median %f"
+            % (
+                mvsd.mean(),
+                mvsd.var(),
+                mvsd.sd(),
+                median(accepted_data, key=lambda x: x.value),
+            )
+        )
+    print("# each " + options.dot + " represents a count of %d" % bucket_scale)
     bucket_min = min_v
     bucket_max = min_v
     percentage = ""
-    format_string = options.format + ' - ' + options.format + ' [%6d]: %s%s'
+    format_string = options.format + " - " + options.format + " [%6d]: %s%s"
     for bucket in range(buckets):
         bucket_min = bucket_max
         bucket_max = boundaries[bucket]
         bucket_count = bucket_counts[bucket]
         star_count = 0
         if bucket_count:
-            star_count = bucket_count / bucket_scale
+            star_count = bucket_count // bucket_scale  # floor, as in Python 2
         if options.percentage:
-            percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) /
-                                         Decimal(samples))
-        print format_string % (bucket_min, bucket_max, bucket_count, options.dot *
-                               star_count, percentage)
+            percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) / Decimal(samples))
+        bar = options.dot * star_count
+        row = (bucket_min, bucket_max, bucket_count, bar, percentage)
+        print(format_string % row)  # keep honouring --bucket-format
 
 
 if __name__ == "__main__":
     parser = OptionParser()
     parser.usage = "cat data | %prog [options]"
-    parser.add_option("-a", "--agg", dest="agg_value_key", default=False,
-                      action="store_true", help="Two column input format, " +
-                      "space seperated with valuekey")
-    parser.add_option("-A", "--agg-key-value", dest="agg_key_value",
-                      default=False, action="store_true", help="Two column " +
-                      "input format, space seperated with keyvalue")
-    parser.add_option("-m", "--min", dest="min",
-                      help="minimum value for graph")
-    parser.add_option("-x", "--max", dest="max",
-                      help="maximum value for graph")
-    parser.add_option("-b", "--buckets", dest="buckets",
-                      help="Number of buckets to use for the histogram")
-    parser.add_option("-l", "--logscale", dest="logscale", default=False,
-                      action="store_true",
-                      help="Buckets grow in logarithmic scale")
-    parser.add_option("-B", "--custom-buckets", dest="custbuckets",
-                      help="Comma seperated list of bucket " +
-                      "edges for the histogram")
-    parser.add_option("--no-mvsd", dest="mvsd", action="store_false",
-                      default=True, help="Disable the calculation of Mean, " +
-                      "Variance and SD (improves performance)")
-    parser.add_option("-f", "--bucket-format", dest="format", default="%10.4f",
-                      help="format for bucket numbers")
-    parser.add_option("-p", "--percentage", dest="percentage", default=False,
-                      action="store_true", help="List percentage for each bar")
-    parser.add_option("--dot", dest="dot", default='∎', help="Dot representation")
+    parser.add_option(
+        "-a",
+        "--agg",
+        dest="agg_value_key",
+        default=False,
+        action="store_true",
+        help="Two column input format, " + "space seperated with valuekey",
+    )
+    parser.add_option(
+        "-A",
+        "--agg-key-value",
+        dest="agg_key_value",
+        default=False,
+        action="store_true",
+        help="Two column " + "input format, space seperated with keyvalue",
+    )
+    parser.add_option("-m", "--min", dest="min", help="minimum value for graph")
+    parser.add_option("-x", "--max", dest="max", help="maximum value for graph")
+    parser.add_option(
+        "-b",
+        "--buckets",
+        dest="buckets",
+        help="Number of buckets to use for the histogram",
+    )
+    parser.add_option(
+        "-l",
+        "--logscale",
+        dest="logscale",
+        default=False,
+        action="store_true",
+        help="Buckets grow in logarithmic scale",
+    )
+    parser.add_option(
+        "-B",
+        "--custom-buckets",
+        dest="custbuckets",
+        help="Comma seperated list of bucket " + "edges for the histogram",
+    )
+    parser.add_option(
+        "--no-mvsd",
+        dest="mvsd",
+        action="store_false",
+        default=True,
+        help="Disable the calculation of Mean, "
+        + "Variance and SD (improves performance)",
+    )
+    parser.add_option(
+        "-f",
+        "--bucket-format",
+        dest="format",
+        default="%10.4f",
+        help="format for bucket numbers",
+    )
+    parser.add_option(
+        "-p",
+        "--percentage",
+        dest="percentage",
+        default=False,
+        action="store_true",
+        help="List percentage for each bar",
+    )
+    parser.add_option("--dot", dest="dot", default="∎", help="Dot representation")
 
     (options, args) = parser.parse_args()
     if sys.stdin.isatty():
         # if isatty() that means it's run without anything piped into it
         parser.print_usage()
-        print "for more help use --help"
+        print("for more help use --help")
         sys.exit(1)
-    histogram(load_stream(sys.stdin, options.agg_value_key,
-                          options.agg_key_value), options)
+    histogram(
+        load_stream(sys.stdin, options.agg_value_key, options.agg_key_value), options
+    )