Skip to content

Update histogram.py to python 3 formatting #46

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 103 additions & 61 deletions data_hacks/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@

class MVSD(object):
"A class that calculates a running Mean / Variance / Standard Deviation"

def __init__(self):
self.is_started = False
self.ss = Decimal(0) # (running) sum of square deviations from mean
Expand All @@ -51,8 +52,7 @@ def add(self, x, w=1):
self.is_started = True
else:
temp_w = self.total_w + w
self.ss += (self.total_w * w * (x - self.m) *
(x - self.m)) / temp_w
self.ss += (self.total_w * w * (x - self.m) * (x - self.m)) / temp_w
self.m += (x - self.m) / temp_w
self.total_w = temp_w

Expand All @@ -65,17 +65,18 @@ def sd(self):
def mean(self):
return self.m

# Record type pairing an observed value with the number of times it occurred.
_DATA_POINT_FIELDS = ["value", "count"]
DataPoint = namedtuple("DataPoint", _DATA_POINT_FIELDS)


def test_mvsd():
    """Feed the integers 0..9 into an MVSD and check its mean, variance and SD."""
    stats = MVSD()
    for sample in range(10):
        stats.add(sample)

    # Results are compared as strings formatted at a fixed precision.
    mean_text = "%.2f" % stats.mean()
    var_text = "%.2f" % stats.var()
    sd_text = "%.14f" % stats.sd()

    assert mean_text == "4.50"
    assert var_text == "8.25"
    assert sd_text == "2.87228132326901"


def load_stream(input_stream, agg_value_key, agg_key_value):
Expand All @@ -96,22 +97,21 @@ def load_stream(input_stream, agg_value_key, agg_key_value):
else:
yield DataPoint(Decimal(clean_line), 1)
except:
logging.exception('failed %r', line)
print >>sys.stderr, "invalid line %r" % line
logging.exception("failed %r", line)
print(sys.stderr, "invalid line %r" % line)


def median(values, key=None):
    """Return the median of *values*.

    Args:
        values: A non-empty iterable of items.
        key: Optional one-argument callable that extracts the number to
            sort by and average from each item. When omitted (or falsy),
            the items themselves are used.

    Returns:
        For an odd number of items, the keyed value of the middle item.
        For an even number of items, the mean of the two middle keyed
        values, computed with true division.

    Raises:
        IndexError: If *values* is empty.
    """
    if not key:
        key = None  # sorted() accepts None as "compare the items themselves"

    ordered = sorted(values, key=key)
    middle = len(ordered) // 2
    # Floor division gives the exact centre index. round() must not be used
    # here: it rounds halves to the nearest even integer, so round(3 / 2) == 2
    # would select the last element of a 3-item list.
    if len(ordered) % 2:
        picked = [ordered[middle]]
    else:
        picked = [ordered[middle - 1], ordered[middle]]

    # Unlike Python 2, map(None, ...) is an error on Python 3, so the key is
    # only applied when one was actually supplied.
    if key is not None:
        picked = [key(item) for item in picked]

    return sum(picked) / len(picked)


def test_median():
Expand Down Expand Up @@ -147,15 +147,15 @@ def histogram(stream, options):
max_v = max_v.value

if not max_v > min_v:
raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v))
raise ValueError("max must be > min. max:%s min:%s" % (max_v, min_v))
diff = max_v - min_v

boundaries = []
bucket_counts = []
buckets = 0

if options.custbuckets:
bound = options.custbuckets.split(',')
bound = options.custbuckets.split(",")
bound_sort = sorted(map(Decimal, bound))

# if the last value is smaller than the maximum, replace it
Expand All @@ -177,7 +177,7 @@ def histogram(stream, options):
elif options.logscale:
buckets = options.buckets and int(options.buckets) or 10
if buckets <= 0:
raise ValueError('# of buckets must be > 0')
raise ValueError("# of buckets must be > 0")

def first_bucket_size(k, n):
"""Logarithmic buckets means, the size of bucket i+1 is twice
Expand All @@ -189,22 +189,23 @@ def first_bucket_size(k, n):
x * (2^{k+1} - 1) = n
x = n/(2^{k+1} - 1)
"""
return n/(2**(k+1)-1)
return n / (2 ** (k + 1) - 1)

def log_steps(k, n):
    """Yield k cumulative offsets whose increments double each step.

    The first increment is ``first_bucket_size(k - 1, n)``; every later
    increment is twice the previous one, so the increments add up to n.
    """
    base = first_bucket_size(k - 1, n)
    running_total = 0
    for exponent in range(k):
        running_total += 2**exponent * base
        yield running_total

bucket_counts = [0 for x in range(buckets)]
for step in log_steps(buckets, diff):
boundaries.append(min_v + step)
else:
buckets = options.buckets and int(options.buckets) or 10
if buckets <= 0:
raise ValueError('# of buckets must be > 0')
raise ValueError("# of buckets must be > 0")
step = diff / buckets
bucket_counts = [0 for x in range(buckets)]
for x in range(buckets):
Expand Down Expand Up @@ -232,69 +233,110 @@ def log_steps(k, n):
if max(bucket_counts) > 75:
bucket_scale = int(max(bucket_counts) / 75)

print("# NumSamples = %d; Min = %0.2f; Max = %0.2f" %
(samples, min_v, max_v))
print("# NumSamples = %d; Min = %0.2f; Max = %0.2f" % (samples, min_v, max_v))
if skipped:
print("# %d value%s outside of min/max" %
(skipped, skipped > 1 and 's' or ''))
print("# %d value%s outside of min/max" % (skipped, skipped > 1 and "s" or ""))
if options.mvsd:
print("# Mean = %f; Variance = %f; SD = %f; Median %f" %
(mvsd.mean(), mvsd.var(), mvsd.sd(),
median(accepted_data, key=lambda x: x.value)))
print "# each " + options.dot + " represents a count of %d" % bucket_scale
print(
"# Mean = %f; Variance = %f; SD = %f; Median %f"
% (
mvsd.mean(),
mvsd.var(),
mvsd.sd(),
median(accepted_data, key=lambda x: x.value),
)
)
print("# each " + options.dot + " represents a count of %d" % bucket_scale)
bucket_min = min_v
bucket_max = min_v
percentage = ""
format_string = options.format + ' - ' + options.format + ' [%6d]: %s%s'
format_string = options.format + " - " + options.format + " [%6d]: %s%s"
for bucket in range(buckets):
bucket_min = bucket_max
bucket_max = boundaries[bucket]
bucket_count = bucket_counts[bucket]
star_count = 0
if bucket_count:
star_count = bucket_count / bucket_scale
star_count = round(bucket_count / bucket_scale)
if options.percentage:
percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) /
Decimal(samples))
print format_string % (bucket_min, bucket_max, bucket_count, options.dot *
star_count, percentage)
percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) / Decimal(samples))
print(
f"{bucket_min} - {bucket_max} [{bucket_count}] {options.dot * star_count} {percentage}"
)


if __name__ == "__main__":
    # Command-line entry point: build the option parser, read data points
    # from stdin and print the histogram.
    parser = OptionParser()
    parser.usage = "cat data | %prog [options]"

    # Each option is registered exactly once; optparse raises
    # OptionConflictError if the same flag is added twice.
    parser.add_option(
        "-a",
        "--agg",
        dest="agg_value_key",
        default=False,
        action="store_true",
        help="Two column input format, space seperated with value<space>key",
    )
    parser.add_option(
        "-A",
        "--agg-key-value",
        dest="agg_key_value",
        default=False,
        action="store_true",
        help="Two column input format, space seperated with key<space>value",
    )
    parser.add_option("-m", "--min", dest="min", help="minimum value for graph")
    parser.add_option("-x", "--max", dest="max", help="maximum value for graph")
    parser.add_option(
        "-b",
        "--buckets",
        dest="buckets",
        help="Number of buckets to use for the histogram",
    )
    parser.add_option(
        "-l",
        "--logscale",
        dest="logscale",
        default=False,
        action="store_true",
        help="Buckets grow in logarithmic scale",
    )
    parser.add_option(
        "-B",
        "--custom-buckets",
        dest="custbuckets",
        help="Comma seperated list of bucket edges for the histogram",
    )
    parser.add_option(
        "--no-mvsd",
        dest="mvsd",
        action="store_false",
        default=True,
        help="Disable the calculation of Mean, Variance and SD (improves performance)",
    )
    parser.add_option(
        "-f",
        "--bucket-format",
        dest="format",
        default="%10.4f",
        help="format for bucket numbers",
    )
    parser.add_option(
        "-p",
        "--percentage",
        dest="percentage",
        default=False,
        action="store_true",
        help="List percentage for each bar",
    )
    parser.add_option("--dot", dest="dot", default="∎", help="Dot representation")

    (options, args) = parser.parse_args()
    if sys.stdin.isatty():
        # if isatty() that means it's run without anything piped into it
        parser.print_usage()
        print("for more help use --help")
        sys.exit(1)

    # Consume stdin once and render the histogram once.
    histogram(
        load_stream(sys.stdin, options.agg_value_key, options.agg_key_value), options
    )