bitly · mira-miracoli · Jan 16, 2024 · Jan 16, 2024 · Jan 16, 2024 · Jan 16, 2024
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py
@@ -34,6 +34,7 @@
 
 class MVSD(object):
     "A class that calculates a running Mean / Variance / Standard Deviation"
+
     def __init__(self):
         self.is_started = False
         self.ss = Decimal(0)  # (running) sum of square deviations from mean
@@ -51,8 +52,7 @@ def add(self, x, w=1):
             self.is_started = True
         else:
             temp_w = self.total_w + w
-            self.ss += (self.total_w * w * (x - self.m) *
-                        (x - self.m)) / temp_w
+            self.ss += (self.total_w * w * (x - self.m) * (x - self.m)) / temp_w
             self.m += (x - self.m) / temp_w
             self.total_w = temp_w
 
@@ -65,17 +65,18 @@ def sd(self):
     def mean(self):
         return self.m
 
-DataPoint = namedtuple('DataPoint', ['value', 'count'])
+
+DataPoint = namedtuple("DataPoint", ["value", "count"])
 
 
 def test_mvsd():
     mvsd = MVSD()
     for x in range(10):
         mvsd.add(x)
 
-    assert '%.2f' % mvsd.mean() == "4.50"
-    assert '%.2f' % mvsd.var() == "8.25"
-    assert '%.14f' % mvsd.sd() == "2.87228132326901"
+    assert "%.2f" % mvsd.mean() == "4.50"
+    assert "%.2f" % mvsd.var() == "8.25"
+    assert "%.14f" % mvsd.sd() == "2.87228132326901"
 
 
 def load_stream(input_stream, agg_value_key, agg_key_value):
@@ -96,22 +97,21 @@ def load_stream(input_stream, agg_value_key, agg_key_value):
             else:
                 yield DataPoint(Decimal(clean_line), 1)
         except:
-            logging.exception('failed %r', line)
-            print >>sys.stderr, "invalid line %r" % line
+            logging.exception("failed %r", line)
+            print("invalid line %r" % line, file=sys.stderr)
 
 
 def median(values, key=None):
     if not key:
         key = None  # map and sort accept None as identity
     length = len(values)
     if length % 2:
-        median_indeces = [length/2]
+        median_indeces = [length // 2]
     else:
-        median_indeces = [length/2-1, length/2]
+        median_indeces = [length // 2 - 1, length // 2]
 
     values = sorted(values, key=key)
-    return sum(map(key,
-                   [values[i] for i in median_indeces])) / len(median_indeces)
+    return sum(key(values[i]) if key else values[i] for i in median_indeces) / len(median_indeces)
 
 
 def test_median():
@@ -147,15 +147,15 @@ def histogram(stream, options):
         max_v = max_v.value
 
     if not max_v > min_v:
-        raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v))
+        raise ValueError("max must be > min. max:%s min:%s" % (max_v, min_v))
     diff = max_v - min_v
 
     boundaries = []
     bucket_counts = []
     buckets = 0
 
     if options.custbuckets:
-        bound = options.custbuckets.split(',')
+        bound = options.custbuckets.split(",")
         bound_sort = sorted(map(Decimal, bound))
 
         # if the last value is smaller than the maximum, replace it
@@ -177,7 +177,7 @@ def histogram(stream, options):
     elif options.logscale:
         buckets = options.buckets and int(options.buckets) or 10
         if buckets <= 0:
-            raise ValueError('# of buckets must be > 0')
+            raise ValueError("# of buckets must be > 0")
 
         def first_bucket_size(k, n):
             """Logarithmic buckets means, the size of bucket i+1 is twice
@@ -189,22 +189,23 @@ def first_bucket_size(k, n):
                 x * (2^{k+1} - 1)      = n
                 x = n/(2^{k+1} - 1)
             """
-            return n/(2**(k+1)-1)
+            return n / (2 ** (k + 1) - 1)
 
         def log_steps(k, n):
             "k logarithmic steps whose sum is n"
-            x = first_bucket_size(k-1, n)
+            x = first_bucket_size(k - 1, n)
             sum = 0
             for i in range(k):
                 sum += 2**i * x
                 yield sum
+
         bucket_counts = [0 for x in range(buckets)]
         for step in log_steps(buckets, diff):
             boundaries.append(min_v + step)
     else:
         buckets = options.buckets and int(options.buckets) or 10
         if buckets <= 0:
-            raise ValueError('# of buckets must be > 0')
+            raise ValueError("# of buckets must be > 0")
         step = diff / buckets
         bucket_counts = [0 for x in range(buckets)]
         for x in range(buckets):
@@ -232,69 +233,110 @@ def log_steps(k, n):
     if max(bucket_counts) > 75:
         bucket_scale = int(max(bucket_counts) / 75)
 
-    print("# NumSamples = %d; Min = %0.2f; Max = %0.2f" %
-          (samples, min_v, max_v))
+    print("# NumSamples = %d; Min = %0.2f; Max = %0.2f" % (samples, min_v, max_v))
     if skipped:
-        print("# %d value%s outside of min/max" %
-              (skipped, skipped > 1 and 's' or ''))
+        print("# %d value%s outside of min/max" % (skipped, skipped > 1 and "s" or ""))
     if options.mvsd:
-        print("# Mean = %f; Variance = %f; SD = %f; Median %f" %
-              (mvsd.mean(), mvsd.var(), mvsd.sd(),
-               median(accepted_data, key=lambda x: x.value)))
-    print "# each " + options.dot + " represents a count of %d" % bucket_scale
+        print(
+            "# Mean = %f; Variance = %f; SD = %f; Median %f"
+            % (
+                mvsd.mean(),
+                mvsd.var(),
+                mvsd.sd(),
+                median(accepted_data, key=lambda x: x.value),
+            )
+        )
+    print("# each " + options.dot + " represents a count of %d" % bucket_scale)
     bucket_min = min_v
     bucket_max = min_v
     percentage = ""
-    format_string = options.format + ' - ' + options.format + ' [%6d]: %s%s'
+    format_string = options.format + " - " + options.format + " [%6d]: %s%s"
     for bucket in range(buckets):
         bucket_min = bucket_max
         bucket_max = boundaries[bucket]
         bucket_count = bucket_counts[bucket]
         star_count = 0
         if bucket_count:
-            star_count = bucket_count / bucket_scale
+            star_count = int(bucket_count // bucket_scale)
         if options.percentage:
-            percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) /
-                                         Decimal(samples))
-        print format_string % (bucket_min, bucket_max, bucket_count, options.dot *
-                               star_count, percentage)
+            percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) / Decimal(samples))
+        bar = options.dot * star_count
+        row = (bucket_min, bucket_max, bucket_count, bar, percentage)
+        print(format_string % row)
 
 
 if __name__ == "__main__":
     parser = OptionParser()
     parser.usage = "cat data | %prog [options]"
-    parser.add_option("-a", "--agg", dest="agg_value_key", default=False,
-                      action="store_true", help="Two column input format, " +
-                      "space seperated with value<space>key")
-    parser.add_option("-A", "--agg-key-value", dest="agg_key_value",
-                      default=False, action="store_true", help="Two column " +
-                      "input format, space seperated with key<space>value")
-    parser.add_option("-m", "--min", dest="min",
-                      help="minimum value for graph")
-    parser.add_option("-x", "--max", dest="max",
-                      help="maximum value for graph")
-    parser.add_option("-b", "--buckets", dest="buckets",
-                      help="Number of buckets to use for the histogram")
-    parser.add_option("-l", "--logscale", dest="logscale", default=False,
-                      action="store_true",
-                      help="Buckets grow in logarithmic scale")
-    parser.add_option("-B", "--custom-buckets", dest="custbuckets",
-                      help="Comma seperated list of bucket " +
-                      "edges for the histogram")
-    parser.add_option("--no-mvsd", dest="mvsd", action="store_false",
-                      default=True, help="Disable the calculation of Mean, " +
-                      "Variance and SD (improves performance)")
-    parser.add_option("-f", "--bucket-format", dest="format", default="%10.4f",
-                      help="format for bucket numbers")
-    parser.add_option("-p", "--percentage", dest="percentage", default=False,
-                      action="store_true", help="List percentage for each bar")
-    parser.add_option("--dot", dest="dot", default='∎', help="Dot representation")
+    parser.add_option(
+        "-a",
+        "--agg",
+        dest="agg_value_key",
+        default=False,
+        action="store_true",
+        help="Two column input format, " + "space seperated with value<space>key",
+    )
+    parser.add_option(
+        "-A",
+        "--agg-key-value",
+        dest="agg_key_value",
+        default=False,
+        action="store_true",
+        help="Two column " + "input format, space seperated with key<space>value",
+    )
+    parser.add_option("-m", "--min", dest="min", help="minimum value for graph")
+    parser.add_option("-x", "--max", dest="max", help="maximum value for graph")
+    parser.add_option(
+        "-b",
+        "--buckets",
+        dest="buckets",
+        help="Number of buckets to use for the histogram",
+    )
+    parser.add_option(
+        "-l",
+        "--logscale",
+        dest="logscale",
+        default=False,
+        action="store_true",
+        help="Buckets grow in logarithmic scale",
+    )
+    parser.add_option(
+        "-B",
+        "--custom-buckets",
+        dest="custbuckets",
+        help="Comma seperated list of bucket " + "edges for the histogram",
+    )
+    parser.add_option(
+        "--no-mvsd",
+        dest="mvsd",
+        action="store_false",
+        default=True,
+        help="Disable the calculation of Mean, "
+        + "Variance and SD (improves performance)",
+    )
+    parser.add_option(
+        "-f",
+        "--bucket-format",
+        dest="format",
+        default="%10.4f",
+        help="format for bucket numbers",
+    )
+    parser.add_option(
+        "-p",
+        "--percentage",
+        dest="percentage",
+        default=False,
+        action="store_true",
+        help="List percentage for each bar",
+    )
+    parser.add_option("--dot", dest="dot", default="∎", help="Dot representation")
 
     (options, args) = parser.parse_args()
     if sys.stdin.isatty():
         # if isatty() that means it's run without anything piped into it
         parser.print_usage()
-        print "for more help use --help"
+        print("for more help use --help")
         sys.exit(1)
-    histogram(load_stream(sys.stdin, options.agg_value_key,
-                          options.agg_key_value), options)
+    histogram(
+        load_stream(sys.stdin, options.agg_value_key, options.agg_key_value), options
+    )