summaryrefslogtreecommitdiffstats
path: root/toolkit/components/telemetry/histogram_tools.py
diff options
context:
space:
mode:
authorMatt A. Tobin <mattatobin@localhost.localdomain>2018-02-02 04:16:08 -0500
committerMatt A. Tobin <mattatobin@localhost.localdomain>2018-02-02 04:16:08 -0500
commit5f8de423f190bbb79a62f804151bc24824fa32d8 (patch)
tree10027f336435511475e392454359edea8e25895d /toolkit/components/telemetry/histogram_tools.py
parent49ee0794b5d912db1f95dce6eb52d781dc210db5 (diff)
downloadUXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.gz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.lz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.tar.xz
UXP-5f8de423f190bbb79a62f804151bc24824fa32d8.zip
Add m-esr52 at 52.6.0
Diffstat (limited to 'toolkit/components/telemetry/histogram_tools.py')
-rw-r--r--toolkit/components/telemetry/histogram_tools.py513
1 files changed, 513 insertions, 0 deletions
diff --git a/toolkit/components/telemetry/histogram_tools.py b/toolkit/components/telemetry/histogram_tools.py
new file mode 100644
index 000000000..db64be268
--- /dev/null
+++ b/toolkit/components/telemetry/histogram_tools.py
@@ -0,0 +1,513 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import collections
+import itertools
+import json
+import math
+import os
+import re
+import sys
+
+# Constants.
+MAX_LABEL_LENGTH = 20
+MAX_LABEL_COUNT = 100
+
+# histogram_tools.py is used by scripts from a mozilla-central build tree
+# and also by outside consumers, such as the telemetry server. We need
+# to ensure that importing things works in both contexts. Therefore,
+# unconditionally importing things that are local to the build tree, such
+# as buildconfig, is a no-no.
+try:
+ import buildconfig
+
+ # Need to update sys.path to be able to find usecounters.
+ sys.path.append(os.path.join(buildconfig.topsrcdir, 'dom/base/'))
+except ImportError:
+ # Must be in an out-of-tree usage scenario. Trust that whoever is
+ # running this script knows we need the usecounters module and has
+ # ensured it's in our sys.path.
+ pass
+
+from collections import OrderedDict
+
+def table_dispatch(kind, table, body):
+ """Call body with table[kind] if it exists. Raise an error otherwise."""
+ if kind in table:
+ return body(table[kind])
+ else:
+ raise BaseException, "don't know how to handle a histogram of kind %s" % kind
+
+class DefinitionException(BaseException):
+ pass
+
+def linear_buckets(dmin, dmax, n_buckets):
+ ret_array = [0] * n_buckets
+ dmin = float(dmin)
+ dmax = float(dmax)
+ for i in range(1, n_buckets):
+ linear_range = (dmin * (n_buckets - 1 - i) + dmax * (i - 1)) / (n_buckets - 2)
+ ret_array[i] = int(linear_range + 0.5)
+ return ret_array
+
+def exponential_buckets(dmin, dmax, n_buckets):
+ log_max = math.log(dmax);
+ bucket_index = 2;
+ ret_array = [0] * n_buckets
+ current = dmin
+ ret_array[1] = current
+ for bucket_index in range(2, n_buckets):
+ log_current = math.log(current)
+ log_ratio = (log_max - log_current) / (n_buckets - bucket_index)
+ log_next = log_current + log_ratio
+ next_value = int(math.floor(math.exp(log_next) + 0.5))
+ if next_value > current:
+ current = next_value
+ else:
+ current = current + 1
+ ret_array[bucket_index] = current
+ return ret_array
+
+always_allowed_keys = ['kind', 'description', 'cpp_guard', 'expires_in_version',
+ 'alert_emails', 'keyed', 'releaseChannelCollection',
+ 'bug_numbers']
+
+whitelists = None;
+try:
+ whitelist_path = os.path.join(os.path.abspath(os.path.realpath(os.path.dirname(__file__))), 'histogram-whitelists.json')
+ with open(whitelist_path, 'r') as f:
+ try:
+ whitelists = json.load(f)
+ for name, whitelist in whitelists.iteritems():
+ whitelists[name] = set(whitelist)
+ except ValueError, e:
+ raise BaseException, 'error parsing whitelist (%s)' % whitelist_path
+except IOError:
+ whitelists = None
+ print 'Unable to parse whitelist (%s). Assuming all histograms are acceptable.' % whitelist_path
+
+class Histogram:
+ """A class for representing a histogram definition."""
+
+ def __init__(self, name, definition, strict_type_checks=False):
+ """Initialize a histogram named name with the given definition.
+definition is a dict-like object that must contain at least the keys:
+
+ - 'kind': The kind of histogram. Must be one of 'boolean', 'flag',
+ 'count', 'enumerated', 'linear', or 'exponential'.
+ - 'description': A textual description of the histogram.
+ - 'strict_type_checks': A boolean indicating whether to use the new, stricter type checks.
+ The server-side still has to deal with old, oddly typed submissions,
+ so we have to skip them there by default.
+
+The key 'cpp_guard' is optional; if present, it denotes a preprocessor
+symbol that should guard C/C++ definitions associated with the histogram."""
+ self._strict_type_checks = strict_type_checks
+ self._is_use_counter = name.startswith("USE_COUNTER2_")
+ self.verify_attributes(name, definition)
+ self._name = name
+ self._description = definition['description']
+ self._kind = definition['kind']
+ self._cpp_guard = definition.get('cpp_guard')
+ self._keyed = definition.get('keyed', False)
+ self._expiration = definition.get('expires_in_version')
+ self._labels = definition.get('labels', [])
+ self.compute_bucket_parameters(definition)
+ table = {
+ 'boolean': 'BOOLEAN',
+ 'flag': 'FLAG',
+ 'count': 'COUNT',
+ 'enumerated': 'LINEAR',
+ 'categorical': 'CATEGORICAL',
+ 'linear': 'LINEAR',
+ 'exponential': 'EXPONENTIAL',
+ }
+ table_dispatch(self.kind(), table,
+ lambda k: self._set_nsITelemetry_kind(k))
+ datasets = { 'opt-in': 'DATASET_RELEASE_CHANNEL_OPTIN',
+ 'opt-out': 'DATASET_RELEASE_CHANNEL_OPTOUT' }
+ value = definition.get('releaseChannelCollection', 'opt-in')
+ if not value in datasets:
+ raise DefinitionException, "unknown release channel collection policy for " + name
+ self._dataset = "nsITelemetry::" + datasets[value]
+
+ def name(self):
+ """Return the name of the histogram."""
+ return self._name
+
+ def description(self):
+ """Return the description of the histogram."""
+ return self._description
+
+ def kind(self):
+ """Return the kind of the histogram.
+Will be one of 'boolean', 'flag', 'count', 'enumerated', 'categorical', 'linear',
+or 'exponential'."""
+ return self._kind
+
+ def expiration(self):
+ """Return the expiration version of the histogram."""
+ return self._expiration
+
+ def nsITelemetry_kind(self):
+ """Return the nsITelemetry constant corresponding to the kind of
+the histogram."""
+ return self._nsITelemetry_kind
+
+ def _set_nsITelemetry_kind(self, kind):
+ self._nsITelemetry_kind = "nsITelemetry::HISTOGRAM_%s" % kind
+
+ def low(self):
+ """Return the lower bound of the histogram."""
+ return self._low
+
+ def high(self):
+ """Return the high bound of the histogram."""
+ return self._high
+
+ def n_buckets(self):
+ """Return the number of buckets in the histogram."""
+ return self._n_buckets
+
+ def cpp_guard(self):
+ """Return the preprocessor symbol that should guard C/C++ definitions
+associated with the histogram. Returns None if no guarding is necessary."""
+ return self._cpp_guard
+
+ def keyed(self):
+ """Returns True if this a keyed histogram, false otherwise."""
+ return self._keyed
+
+ def dataset(self):
+ """Returns the dataset this histogram belongs into."""
+ return self._dataset
+
+ def labels(self):
+ """Returns a list of labels for a categorical histogram, [] for others."""
+ return self._labels
+
+ def ranges(self):
+ """Return an array of lower bounds for each bucket in the histogram."""
+ table = {
+ 'boolean': linear_buckets,
+ 'flag': linear_buckets,
+ 'count': linear_buckets,
+ 'enumerated': linear_buckets,
+ 'categorical': linear_buckets,
+ 'linear': linear_buckets,
+ 'exponential': exponential_buckets,
+ }
+ return table_dispatch(self.kind(), table,
+ lambda p: p(self.low(), self.high(), self.n_buckets()))
+
+ def compute_bucket_parameters(self, definition):
+ table = {
+ 'boolean': Histogram.boolean_flag_bucket_parameters,
+ 'flag': Histogram.boolean_flag_bucket_parameters,
+ 'count': Histogram.boolean_flag_bucket_parameters,
+ 'enumerated': Histogram.enumerated_bucket_parameters,
+ 'categorical': Histogram.categorical_bucket_parameters,
+ 'linear': Histogram.linear_bucket_parameters,
+ 'exponential': Histogram.exponential_bucket_parameters,
+ }
+ table_dispatch(self.kind(), table,
+ lambda p: self.set_bucket_parameters(*p(definition)))
+
+ def verify_attributes(self, name, definition):
+ global always_allowed_keys
+ general_keys = always_allowed_keys + ['low', 'high', 'n_buckets']
+
+ table = {
+ 'boolean': always_allowed_keys,
+ 'flag': always_allowed_keys,
+ 'count': always_allowed_keys,
+ 'enumerated': always_allowed_keys + ['n_values'],
+ 'categorical': always_allowed_keys + ['labels'],
+ 'linear': general_keys,
+ 'exponential': general_keys,
+ }
+ # We removed extended_statistics_ok on the client, but the server-side,
+ # where _strict_type_checks==False, has to deal with historical data.
+ if not self._strict_type_checks:
+ table['exponential'].append('extended_statistics_ok')
+
+ table_dispatch(definition['kind'], table,
+ lambda allowed_keys: Histogram.check_keys(name, definition, allowed_keys))
+
+ self.check_name(name)
+ self.check_field_types(name, definition)
+ self.check_whitelistable_fields(name, definition)
+ self.check_expiration(name, definition)
+ self.check_label_values(name, definition)
+
+ def check_name(self, name):
+ if '#' in name:
+ raise ValueError, '"#" not permitted for %s' % (name)
+
+ # Avoid C++ identifier conflicts between histogram enums and label enum names.
+ if name.startswith("LABELS_"):
+ raise ValueError, "Histogram name '%s' can not start with LABELS_" % (name)
+
+ # To make it easier to generate C++ identifiers from this etc., we restrict
+ # the histogram names to a strict pattern.
+ # We skip this on the server to avoid failures with old Histogram.json revisions.
+ if self._strict_type_checks:
+ pattern = '^[a-z][a-z0-9_]+[a-z0-9]$'
+ if not re.match(pattern, name, re.IGNORECASE):
+ raise ValueError, "Histogram name '%s' doesn't confirm to '%s'" % (name, pattern)
+
+ def check_expiration(self, name, definition):
+ field = 'expires_in_version'
+ expiration = definition.get(field)
+
+ if not expiration:
+ return
+
+ # We forbid new probes from using "expires_in_version" : "default" field/value pair.
+ # Old ones that use this are added to the whitelist.
+ if expiration == "default" and name not in whitelists['expiry_default']:
+ raise ValueError, 'New histogram "%s" cannot have "default" %s value.' % (name, field)
+
+ if re.match(r'^[1-9][0-9]*$', expiration):
+ expiration = expiration + ".0a1"
+ elif re.match(r'^[1-9][0-9]*\.0$', expiration):
+ expiration = expiration + "a1"
+
+ definition[field] = expiration
+
+ def check_label_values(self, name, definition):
+ labels = definition.get('labels')
+ if not labels:
+ return
+
+ invalid = filter(lambda l: len(l) > MAX_LABEL_LENGTH, labels)
+ if len(invalid) > 0:
+ raise ValueError, 'Label values for %s exceed length limit of %d: %s' % \
+ (name, MAX_LABEL_LENGTH, ', '.join(invalid))
+
+ if len(labels) > MAX_LABEL_COUNT:
+ raise ValueError, 'Label count for %s exceeds limit of %d' % \
+ (name, MAX_LABEL_COUNT)
+
+ # To make it easier to generate C++ identifiers from this etc., we restrict
+ # the label values to a strict pattern.
+ pattern = '^[a-z][a-z0-9_]+[a-z0-9]$'
+ invalid = filter(lambda l: not re.match(pattern, l, re.IGNORECASE), labels)
+ if len(invalid) > 0:
+ raise ValueError, 'Label values for %s are not matching pattern "%s": %s' % \
+ (name, pattern, ', '.join(invalid))
+
+ # Check for the presence of fields that old histograms are whitelisted for.
+ def check_whitelistable_fields(self, name, definition):
+ # Use counters don't have any mechanism to add the fields checked here,
+ # so skip the check for them.
+ # We also don't need to run any of these checks on the server.
+ if self._is_use_counter or not self._strict_type_checks:
+ return
+
+ # In the pipeline we don't have whitelists available.
+ if whitelists is None:
+ return
+
+ for field in ['alert_emails', 'bug_numbers']:
+ if field not in definition and name not in whitelists[field]:
+ raise KeyError, 'New histogram "%s" must have a %s field.' % (name, field)
+ if field in definition and name in whitelists[field]:
+ msg = 'Should remove histogram "%s" from the whitelist for "%s" in histogram-whitelists.json'
+ raise KeyError, msg % (name, field)
+
+ def check_field_types(self, name, definition):
+ # Define expected types for the histogram properties.
+ type_checked_fields = {
+ "n_buckets": int,
+ "n_values": int,
+ "low": int,
+ "high": int,
+ "keyed": bool,
+ "expires_in_version": basestring,
+ "kind": basestring,
+ "description": basestring,
+ "cpp_guard": basestring,
+ "releaseChannelCollection": basestring,
+ }
+
+ # For list fields we check the items types.
+ type_checked_list_fields = {
+ "bug_numbers": int,
+ "alert_emails": basestring,
+ "labels": basestring,
+ }
+
+ # For the server-side, where _strict_type_checks==False, we want to
+ # skip the stricter type checks for these fields for dealing with
+ # historical data.
+ coerce_fields = ["low", "high", "n_values", "n_buckets"]
+ if not self._strict_type_checks:
+ def try_to_coerce_to_number(v):
+ try:
+ return eval(v, {})
+ except:
+ return v
+ for key in [k for k in coerce_fields if k in definition]:
+ definition[key] = try_to_coerce_to_number(definition[key])
+ # This handles old "keyed":"true" definitions (bug 1271986).
+ if definition.get("keyed", None) == "true":
+ definition["keyed"] = True
+
+ def nice_type_name(t):
+ if t is basestring:
+ return "string"
+ return t.__name__
+
+ for key, key_type in type_checked_fields.iteritems():
+ if not key in definition:
+ continue
+ if not isinstance(definition[key], key_type):
+ raise ValueError, ('value for key "{0}" in Histogram "{1}" '
+ 'should be {2}').format(key, name, nice_type_name(key_type))
+
+ for key, key_type in type_checked_list_fields.iteritems():
+ if not key in definition:
+ continue
+ if not all(isinstance(x, key_type) for x in definition[key]):
+ raise ValueError, ('all values for list "{0}" in Histogram "{1}" '
+ 'should be {2}').format(key, name, nice_type_name(key_type))
+
+ @staticmethod
+ def check_keys(name, definition, allowed_keys):
+ for key in definition.iterkeys():
+ if key not in allowed_keys:
+ raise KeyError, '%s not permitted for %s' % (key, name)
+
+ def set_bucket_parameters(self, low, high, n_buckets):
+ self._low = low
+ self._high = high
+ self._n_buckets = n_buckets
+ if whitelists is not None and self._n_buckets > 100 and type(self._n_buckets) is int:
+ if self._name not in whitelists['n_buckets']:
+ raise KeyError, ('New histogram "%s" is not permitted to have more than 100 buckets. '
+ 'Histograms with large numbers of buckets use disproportionately high amounts of resources. '
+ 'Contact the Telemetry team (e.g. in #telemetry) if you think an exception ought to be made.' % self._name)
+
+ @staticmethod
+ def boolean_flag_bucket_parameters(definition):
+ return (1, 2, 3)
+
+ @staticmethod
+ def linear_bucket_parameters(definition):
+ return (definition.get('low', 1),
+ definition['high'],
+ definition['n_buckets'])
+
+ @staticmethod
+ def enumerated_bucket_parameters(definition):
+ n_values = definition['n_values']
+ return (1, n_values, n_values + 1)
+
+ @staticmethod
+ def categorical_bucket_parameters(definition):
+ n_values = len(definition['labels'])
+ return (1, n_values, n_values + 1)
+
+ @staticmethod
+ def exponential_bucket_parameters(definition):
+ return (definition.get('low', 1),
+ definition['high'],
+ definition['n_buckets'])
+
+# We support generating histograms from multiple different input files, not
+# just Histograms.json. For each file's basename, we have a specific
+# routine to parse that file, and return a dictionary mapping histogram
+# names to histogram parameters.
+def from_Histograms_json(filename):
+ with open(filename, 'r') as f:
+ try:
+ histograms = json.load(f, object_pairs_hook=OrderedDict)
+ except ValueError, e:
+ raise BaseException, "error parsing histograms in %s: %s" % (filename, e.message)
+ return histograms
+
+def from_UseCounters_conf(filename):
+ return usecounters.generate_histograms(filename)
+
+def from_nsDeprecatedOperationList(filename):
+ operation_regex = re.compile('^DEPRECATED_OPERATION\\(([^)]+)\\)')
+ histograms = collections.OrderedDict()
+
+ with open(filename, 'r') as f:
+ for line in f:
+ match = operation_regex.search(line)
+ if not match:
+ continue
+
+ op = match.group(1)
+
+ def add_counter(context):
+ name = 'USE_COUNTER2_DEPRECATED_%s_%s' % (op, context.upper())
+ histograms[name] = {
+ 'expires_in_version': 'never',
+ 'kind': 'boolean',
+ 'description': 'Whether a %s used %s' % (context, op)
+ }
+ add_counter('document')
+ add_counter('page')
+
+ return histograms
+
+FILENAME_PARSERS = {
+ 'Histograms.json': from_Histograms_json,
+ 'nsDeprecatedOperationList.h': from_nsDeprecatedOperationList,
+}
+
+# Similarly to the dance above with buildconfig, usecounters may not be
+# available, so handle that gracefully.
+try:
+ import usecounters
+
+ FILENAME_PARSERS['UseCounters.conf'] = from_UseCounters_conf
+except ImportError:
+ pass
+
+def from_files(filenames):
+ """Return an iterator that provides a sequence of Histograms for
+the histograms defined in filenames.
+ """
+ all_histograms = OrderedDict()
+ for filename in filenames:
+ parser = FILENAME_PARSERS[os.path.basename(filename)]
+ histograms = parser(filename)
+
+ # OrderedDicts are important, because then the iteration order over
+ # the parsed histograms is stable, which makes the insertion into
+ # all_histograms stable, which makes ordering in generated files
+ # stable, which makes builds more deterministic.
+ if not isinstance(histograms, OrderedDict):
+ raise BaseException, "histogram parser didn't provide an OrderedDict"
+
+ for (name, definition) in histograms.iteritems():
+ if all_histograms.has_key(name):
+ raise DefinitionException, "duplicate histogram name %s" % name
+ all_histograms[name] = definition
+
+ # We require that all USE_COUNTER2_* histograms be defined in a contiguous
+ # block.
+ use_counter_indices = filter(lambda x: x[1].startswith("USE_COUNTER2_"),
+ enumerate(all_histograms.iterkeys()));
+ if use_counter_indices:
+ lower_bound = use_counter_indices[0][0]
+ upper_bound = use_counter_indices[-1][0]
+ n_counters = upper_bound - lower_bound + 1
+ if n_counters != len(use_counter_indices):
+ raise DefinitionException, "use counter histograms must be defined in a contiguous block"
+
+ # Check that histograms that were removed from Histograms.json etc. are also removed from the whitelists.
+ if whitelists is not None:
+ all_whitelist_entries = itertools.chain.from_iterable(whitelists.itervalues())
+ orphaned = set(all_whitelist_entries) - set(all_histograms.keys())
+ if len(orphaned) > 0:
+ msg = 'The following entries are orphaned and should be removed from histogram-whitelists.json: %s'
+ raise BaseException, msg % (', '.join(sorted(orphaned)))
+
+ for (name, definition) in all_histograms.iteritems():
+ yield Histogram(name, definition, strict_type_checks=True)