diff options
Diffstat (limited to 'toolkit/components/telemetry/histogram_tools.py')
-rw-r--r-- | toolkit/components/telemetry/histogram_tools.py | 513 |
1 files changed, 513 insertions, 0 deletions
diff --git a/toolkit/components/telemetry/histogram_tools.py b/toolkit/components/telemetry/histogram_tools.py new file mode 100644 index 000000000..db64be268 --- /dev/null +++ b/toolkit/components/telemetry/histogram_tools.py @@ -0,0 +1,513 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import collections +import itertools +import json +import math +import os +import re +import sys + +# Constants. +MAX_LABEL_LENGTH = 20 +MAX_LABEL_COUNT = 100 + +# histogram_tools.py is used by scripts from a mozilla-central build tree +# and also by outside consumers, such as the telemetry server. We need +# to ensure that importing things works in both contexts. Therefore, +# unconditionally importing things that are local to the build tree, such +# as buildconfig, is a no-no. +try: + import buildconfig + + # Need to update sys.path to be able to find usecounters. + sys.path.append(os.path.join(buildconfig.topsrcdir, 'dom/base/')) +except ImportError: + # Must be in an out-of-tree usage scenario. Trust that whoever is + # running this script knows we need the usecounters module and has + # ensured it's in our sys.path. + pass + +from collections import OrderedDict + +def table_dispatch(kind, table, body): + """Call body with table[kind] if it exists. Raise an error otherwise.""" + if kind in table: + return body(table[kind]) + else: + raise BaseException, "don't know how to handle a histogram of kind %s" % kind + +class DefinitionException(BaseException): + pass + +def linear_buckets(dmin, dmax, n_buckets): + ret_array = [0] * n_buckets + dmin = float(dmin) + dmax = float(dmax) + for i in range(1, n_buckets): + linear_range = (dmin * (n_buckets - 1 - i) + dmax * (i - 1)) / (n_buckets - 2) + ret_array[i] = int(linear_range + 0.5) + return ret_array + +def exponential_buckets(dmin, dmax, n_buckets): + log_max = math.log(dmax); + bucket_index = 2; + ret_array = [0] * n_buckets + current = dmin + ret_array[1] = current + for bucket_index in range(2, n_buckets): + log_current = math.log(current) + log_ratio = (log_max - log_current) / (n_buckets - bucket_index) + log_next = log_current + log_ratio + next_value = int(math.floor(math.exp(log_next) + 0.5)) + if next_value > current: + current = next_value + else: + current = current + 1 + ret_array[bucket_index] = current + return ret_array + +always_allowed_keys = ['kind', 'description', 'cpp_guard', 'expires_in_version', + 'alert_emails', 'keyed', 'releaseChannelCollection', + 'bug_numbers'] + +whitelists = None; +try: + whitelist_path = os.path.join(os.path.abspath(os.path.realpath(os.path.dirname(__file__))), 'histogram-whitelists.json') + with open(whitelist_path, 'r') as f: + try: + whitelists = json.load(f) + for name, whitelist in whitelists.iteritems(): + whitelists[name] = set(whitelist) + except ValueError, e: + raise BaseException, 'error parsing whitelist (%s)' % whitelist_path +except IOError: + whitelists = None + print 'Unable to parse whitelist (%s). Assuming all histograms are acceptable.' % whitelist_path + +class Histogram: + """A class for representing a histogram definition.""" + + def __init__(self, name, definition, strict_type_checks=False): + """Initialize a histogram named name with the given definition. +definition is a dict-like object that must contain at least the keys: + + - 'kind': The kind of histogram. Must be one of 'boolean', 'flag', + 'count', 'enumerated', 'linear', or 'exponential'. + - 'description': A textual description of the histogram. + - 'strict_type_checks': A boolean indicating whether to use the new, stricter type checks. + The server-side still has to deal with old, oddly typed submissions, + so we have to skip them there by default. + +The key 'cpp_guard' is optional; if present, it denotes a preprocessor +symbol that should guard C/C++ definitions associated with the histogram.""" + self._strict_type_checks = strict_type_checks + self._is_use_counter = name.startswith("USE_COUNTER2_") + self.verify_attributes(name, definition) + self._name = name + self._description = definition['description'] + self._kind = definition['kind'] + self._cpp_guard = definition.get('cpp_guard') + self._keyed = definition.get('keyed', False) + self._expiration = definition.get('expires_in_version') + self._labels = definition.get('labels', []) + self.compute_bucket_parameters(definition) + table = { + 'boolean': 'BOOLEAN', + 'flag': 'FLAG', + 'count': 'COUNT', + 'enumerated': 'LINEAR', + 'categorical': 'CATEGORICAL', + 'linear': 'LINEAR', + 'exponential': 'EXPONENTIAL', + } + table_dispatch(self.kind(), table, + lambda k: self._set_nsITelemetry_kind(k)) + datasets = { 'opt-in': 'DATASET_RELEASE_CHANNEL_OPTIN', + 'opt-out': 'DATASET_RELEASE_CHANNEL_OPTOUT' } + value = definition.get('releaseChannelCollection', 'opt-in') + if not value in datasets: + raise DefinitionException, "unknown release channel collection policy for " + name + self._dataset = "nsITelemetry::" + datasets[value] + + def name(self): + """Return the name of the histogram.""" + return self._name + + def description(self): + """Return the description of the histogram.""" + return self._description + + def kind(self): + """Return the kind of the histogram. +Will be one of 'boolean', 'flag', 'count', 'enumerated', 'categorical', 'linear', +or 'exponential'.""" + return self._kind + + def expiration(self): + """Return the expiration version of the histogram.""" + return self._expiration + + def nsITelemetry_kind(self): + """Return the nsITelemetry constant corresponding to the kind of +the histogram.""" + return self._nsITelemetry_kind + + def _set_nsITelemetry_kind(self, kind): + self._nsITelemetry_kind = "nsITelemetry::HISTOGRAM_%s" % kind + + def low(self): + """Return the lower bound of the histogram.""" + return self._low + + def high(self): + """Return the high bound of the histogram.""" + return self._high + + def n_buckets(self): + """Return the number of buckets in the histogram.""" + return self._n_buckets + + def cpp_guard(self): + """Return the preprocessor symbol that should guard C/C++ definitions +associated with the histogram. Returns None if no guarding is necessary.""" + return self._cpp_guard + + def keyed(self): + """Returns True if this a keyed histogram, false otherwise.""" + return self._keyed + + def dataset(self): + """Returns the dataset this histogram belongs into.""" + return self._dataset + + def labels(self): + """Returns a list of labels for a categorical histogram, [] for others.""" + return self._labels + + def ranges(self): + """Return an array of lower bounds for each bucket in the histogram.""" + table = { + 'boolean': linear_buckets, + 'flag': linear_buckets, + 'count': linear_buckets, + 'enumerated': linear_buckets, + 'categorical': linear_buckets, + 'linear': linear_buckets, + 'exponential': exponential_buckets, + } + return table_dispatch(self.kind(), table, + lambda p: p(self.low(), self.high(), self.n_buckets())) + + def compute_bucket_parameters(self, definition): + table = { + 'boolean': Histogram.boolean_flag_bucket_parameters, + 'flag': Histogram.boolean_flag_bucket_parameters, + 'count': Histogram.boolean_flag_bucket_parameters, + 'enumerated': Histogram.enumerated_bucket_parameters, + 'categorical': Histogram.categorical_bucket_parameters, + 'linear': Histogram.linear_bucket_parameters, + 'exponential': Histogram.exponential_bucket_parameters, + } + table_dispatch(self.kind(), table, + lambda p: self.set_bucket_parameters(*p(definition))) + + def verify_attributes(self, name, definition): + global always_allowed_keys + general_keys = always_allowed_keys + ['low', 'high', 'n_buckets'] + + table = { + 'boolean': always_allowed_keys, + 'flag': always_allowed_keys, + 'count': always_allowed_keys, + 'enumerated': always_allowed_keys + ['n_values'], + 'categorical': always_allowed_keys + ['labels'], + 'linear': general_keys, + 'exponential': general_keys, + } + # We removed extended_statistics_ok on the client, but the server-side, + # where _strict_type_checks==False, has to deal with historical data. + if not self._strict_type_checks: + table['exponential'].append('extended_statistics_ok') + + table_dispatch(definition['kind'], table, + lambda allowed_keys: Histogram.check_keys(name, definition, allowed_keys)) + + self.check_name(name) + self.check_field_types(name, definition) + self.check_whitelistable_fields(name, definition) + self.check_expiration(name, definition) + self.check_label_values(name, definition) + + def check_name(self, name): + if '#' in name: + raise ValueError, '"#" not permitted for %s' % (name) + + # Avoid C++ identifier conflicts between histogram enums and label enum names. + if name.startswith("LABELS_"): + raise ValueError, "Histogram name '%s' can not start with LABELS_" % (name) + + # To make it easier to generate C++ identifiers from this etc., we restrict + # the histogram names to a strict pattern. + # We skip this on the server to avoid failures with old Histogram.json revisions. + if self._strict_type_checks: + pattern = '^[a-z][a-z0-9_]+[a-z0-9]$' + if not re.match(pattern, name, re.IGNORECASE): + raise ValueError, "Histogram name '%s' doesn't confirm to '%s'" % (name, pattern) + + def check_expiration(self, name, definition): + field = 'expires_in_version' + expiration = definition.get(field) + + if not expiration: + return + + # We forbid new probes from using "expires_in_version" : "default" field/value pair. + # Old ones that use this are added to the whitelist. + if expiration == "default" and name not in whitelists['expiry_default']: + raise ValueError, 'New histogram "%s" cannot have "default" %s value.' % (name, field) + + if re.match(r'^[1-9][0-9]*$', expiration): + expiration = expiration + ".0a1" + elif re.match(r'^[1-9][0-9]*\.0$', expiration): + expiration = expiration + "a1" + + definition[field] = expiration + + def check_label_values(self, name, definition): + labels = definition.get('labels') + if not labels: + return + + invalid = filter(lambda l: len(l) > MAX_LABEL_LENGTH, labels) + if len(invalid) > 0: + raise ValueError, 'Label values for %s exceed length limit of %d: %s' % \ + (name, MAX_LABEL_LENGTH, ', '.join(invalid)) + + if len(labels) > MAX_LABEL_COUNT: + raise ValueError, 'Label count for %s exceeds limit of %d' % \ + (name, MAX_LABEL_COUNT) + + # To make it easier to generate C++ identifiers from this etc., we restrict + # the label values to a strict pattern. + pattern = '^[a-z][a-z0-9_]+[a-z0-9]$' + invalid = filter(lambda l: not re.match(pattern, l, re.IGNORECASE), labels) + if len(invalid) > 0: + raise ValueError, 'Label values for %s are not matching pattern "%s": %s' % \ + (name, pattern, ', '.join(invalid)) + + # Check for the presence of fields that old histograms are whitelisted for. + def check_whitelistable_fields(self, name, definition): + # Use counters don't have any mechanism to add the fields checked here, + # so skip the check for them. + # We also don't need to run any of these checks on the server. + if self._is_use_counter or not self._strict_type_checks: + return + + # In the pipeline we don't have whitelists available. + if whitelists is None: + return + + for field in ['alert_emails', 'bug_numbers']: + if field not in definition and name not in whitelists[field]: + raise KeyError, 'New histogram "%s" must have a %s field.' % (name, field) + if field in definition and name in whitelists[field]: + msg = 'Should remove histogram "%s" from the whitelist for "%s" in histogram-whitelists.json' + raise KeyError, msg % (name, field) + + def check_field_types(self, name, definition): + # Define expected types for the histogram properties. + type_checked_fields = { + "n_buckets": int, + "n_values": int, + "low": int, + "high": int, + "keyed": bool, + "expires_in_version": basestring, + "kind": basestring, + "description": basestring, + "cpp_guard": basestring, + "releaseChannelCollection": basestring, + } + + # For list fields we check the items types. + type_checked_list_fields = { + "bug_numbers": int, + "alert_emails": basestring, + "labels": basestring, + } + + # For the server-side, where _strict_type_checks==False, we want to + # skip the stricter type checks for these fields for dealing with + # historical data. + coerce_fields = ["low", "high", "n_values", "n_buckets"] + if not self._strict_type_checks: + def try_to_coerce_to_number(v): + try: + return eval(v, {}) + except: + return v + for key in [k for k in coerce_fields if k in definition]: + definition[key] = try_to_coerce_to_number(definition[key]) + # This handles old "keyed":"true" definitions (bug 1271986). + if definition.get("keyed", None) == "true": + definition["keyed"] = True + + def nice_type_name(t): + if t is basestring: + return "string" + return t.__name__ + + for key, key_type in type_checked_fields.iteritems(): + if not key in definition: + continue + if not isinstance(definition[key], key_type): + raise ValueError, ('value for key "{0}" in Histogram "{1}" ' + 'should be {2}').format(key, name, nice_type_name(key_type)) + + for key, key_type in type_checked_list_fields.iteritems(): + if not key in definition: + continue + if not all(isinstance(x, key_type) for x in definition[key]): + raise ValueError, ('all values for list "{0}" in Histogram "{1}" ' + 'should be {2}').format(key, name, nice_type_name(key_type)) + + @staticmethod + def check_keys(name, definition, allowed_keys): + for key in definition.iterkeys(): + if key not in allowed_keys: + raise KeyError, '%s not permitted for %s' % (key, name) + + def set_bucket_parameters(self, low, high, n_buckets): + self._low = low + self._high = high + self._n_buckets = n_buckets + if whitelists is not None and self._n_buckets > 100 and type(self._n_buckets) is int: + if self._name not in whitelists['n_buckets']: + raise KeyError, ('New histogram "%s" is not permitted to have more than 100 buckets. ' + 'Histograms with large numbers of buckets use disproportionately high amounts of resources. ' + 'Contact the Telemetry team (e.g. in #telemetry) if you think an exception ought to be made.' % self._name) + + @staticmethod + def boolean_flag_bucket_parameters(definition): + return (1, 2, 3) + + @staticmethod + def linear_bucket_parameters(definition): + return (definition.get('low', 1), + definition['high'], + definition['n_buckets']) + + @staticmethod + def enumerated_bucket_parameters(definition): + n_values = definition['n_values'] + return (1, n_values, n_values + 1) + + @staticmethod + def categorical_bucket_parameters(definition): + n_values = len(definition['labels']) + return (1, n_values, n_values + 1) + + @staticmethod + def exponential_bucket_parameters(definition): + return (definition.get('low', 1), + definition['high'], + definition['n_buckets']) + +# We support generating histograms from multiple different input files, not +# just Histograms.json. For each file's basename, we have a specific +# routine to parse that file, and return a dictionary mapping histogram +# names to histogram parameters. +def from_Histograms_json(filename): + with open(filename, 'r') as f: + try: + histograms = json.load(f, object_pairs_hook=OrderedDict) + except ValueError, e: + raise BaseException, "error parsing histograms in %s: %s" % (filename, e.message) + return histograms + +def from_UseCounters_conf(filename): + return usecounters.generate_histograms(filename) + +def from_nsDeprecatedOperationList(filename): + operation_regex = re.compile('^DEPRECATED_OPERATION\\(([^)]+)\\)') + histograms = collections.OrderedDict() + + with open(filename, 'r') as f: + for line in f: + match = operation_regex.search(line) + if not match: + continue + + op = match.group(1) + + def add_counter(context): + name = 'USE_COUNTER2_DEPRECATED_%s_%s' % (op, context.upper()) + histograms[name] = { + 'expires_in_version': 'never', + 'kind': 'boolean', + 'description': 'Whether a %s used %s' % (context, op) + } + add_counter('document') + add_counter('page') + + return histograms + +FILENAME_PARSERS = { + 'Histograms.json': from_Histograms_json, + 'nsDeprecatedOperationList.h': from_nsDeprecatedOperationList, +} + +# Similarly to the dance above with buildconfig, usecounters may not be +# available, so handle that gracefully. +try: + import usecounters + + FILENAME_PARSERS['UseCounters.conf'] = from_UseCounters_conf +except ImportError: + pass + +def from_files(filenames): + """Return an iterator that provides a sequence of Histograms for +the histograms defined in filenames. + """ + all_histograms = OrderedDict() + for filename in filenames: + parser = FILENAME_PARSERS[os.path.basename(filename)] + histograms = parser(filename) + + # OrderedDicts are important, because then the iteration order over + # the parsed histograms is stable, which makes the insertion into + # all_histograms stable, which makes ordering in generated files + # stable, which makes builds more deterministic. + if not isinstance(histograms, OrderedDict): + raise BaseException, "histogram parser didn't provide an OrderedDict" + + for (name, definition) in histograms.iteritems(): + if all_histograms.has_key(name): + raise DefinitionException, "duplicate histogram name %s" % name + all_histograms[name] = definition + + # We require that all USE_COUNTER2_* histograms be defined in a contiguous + # block. + use_counter_indices = filter(lambda x: x[1].startswith("USE_COUNTER2_"), + enumerate(all_histograms.iterkeys())); + if use_counter_indices: + lower_bound = use_counter_indices[0][0] + upper_bound = use_counter_indices[-1][0] + n_counters = upper_bound - lower_bound + 1 + if n_counters != len(use_counter_indices): + raise DefinitionException, "use counter histograms must be defined in a contiguous block" + + # Check that histograms that were removed from Histograms.json etc. are also removed from the whitelists. + if whitelists is not None: + all_whitelist_entries = itertools.chain.from_iterable(whitelists.itervalues()) + orphaned = set(all_whitelist_entries) - set(all_histograms.keys()) + if len(orphaned) > 0: + msg = 'The following entries are orphaned and should be removed from histogram-whitelists.json: %s' + raise BaseException, msg % (', '.join(sorted(orphaned))) + + for (name, definition) in all_histograms.iteritems(): + yield Histogram(name, definition, strict_type_checks=True) |