/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #include "WebGLElementArrayCache.h" #include <algorithm> #include <cstdlib> #include <cstring> #include <limits> #include "mozilla/Assertions.h" #include "mozilla/MathAlgorithms.h" #include "mozilla/MemoryReporting.h" namespace mozilla { /* WebGLElementArrayCacheTree contains most of the implementation of * WebGLElementArrayCache, which performs WebGL element array buffer validation * for drawElements. * * Attention: Here lie nontrivial data structures, bug-prone algorithms, and * non-canonical tweaks! Whence the explanatory comments, and compiled unit * test. * * *** What problem are we solving here? *** * * WebGL::DrawElements has to validate that the elements are in range wrt the * current vertex attribs. This boils down to the problem, given an array of * integers, of computing the maximum in an arbitrary sub-array. The naive * algorithm has linear complexity; this has been a major performance problem, * see bug 569431. In that bug, we took the approach of caching the max for the * whole array, which does cover most cases (DrawElements typically consumes the * whole element array buffer) but doesn't help in other use cases: * - when doing "partial DrawElements" i.e. consuming only part of the element * array buffer * - when doing frequent "partial buffer updates" i.e. bufferSubData calls * updating parts of the element array buffer * * *** The solution: A binary tree *** * * The solution implemented here is to use a binary tree as the cache data * structure. Each tree node contains the max of its two children nodes. In this * way, finding the maximum in any contiguous sub-array has log complexity * instead of linear complexity. * * Simplistically, if the element array is: * * [1 4 3 2] * * then the corresponding tree is: * * 4 * _/ \_ * 4 3 * / \ / \ * 1 4 3 2 * * In practice, the bottom-most levels of the tree are both the largest to store * (because they have more nodes), and the least useful performance-wise * (because each node in the bottom levels concerns only few entries in the * elements array buffer, it is cheap to compute). * * For this reason, we stop the tree a few levels above, so that each tree leaf * actually corresponds to more than one element array entry. * * The number of levels that we "drop" is |kSkippedBottomTreeLevels| and the * number of element array entries that each leaf corresponds to, is * |kElementsPerLeaf|. This being a binary tree, we have: * * kElementsPerLeaf = 2 ^ kSkippedBottomTreeLevels. * * *** Storage layout of the binary tree *** * * We take advantage of the specifics of the situation to avoid generalist tree * storage and instead store the tree entries in a vector, mTreeData. * * TreeData is always a vector of length: * * 2 * (number of leaves). * * Its data layout is as follows: mTreeData[0] is unused, mTreeData[1] is the * root node, then at offsets 2..3 is the tree level immediately below the root * node, then at offsets 4..7 is the tree level below that, etc. * * The figure below illustrates this by writing at each tree node the offset * into mTreeData at which it is stored: * * 1 * _/ \_ * 2 3 * / \ / \ * 4 5 6 7 * ... * * Thus, under the convention that the root level is level 0, we see that level * N is stored at offsets: * * [ 2^n .. 2^(n+1) - 1 ] * * in mTreeData. Likewise, all the usual tree operations have simple * mathematical expressions in terms of mTreeData offsets, see all the methods * such as ParentNode, LeftChildNode, etc. * * *** Design constraint: Element types aren't known at buffer-update time *** * * Note that a key constraint that we're operating under, is that we don't know * the types of the elements by the time WebGL bufferData/bufferSubData methods * are called. The type of elements is only specified in the drawElements call. * This means that we may potentially have to store caches for multiple element * types, for the same element array buffer. Since we don't know yet how many * element types we'll eventually support (extensions add more), the concern * about memory usage is serious. This is addressed by kSkippedBottomTreeLevels * as explained above. Of course, in the typical case where each element array * buffer is only ever used with one type, this is also addressed by having * WebGLElementArrayCache lazily create trees for each type only upon first use. * * Another consequence of this constraint is that when updating the trees, we * have to update all existing trees. So if trees for types uint8_t, uint16_t * and uint32_t have ever been constructed for this buffer, every subsequent * update will have to update all trees even if one of the types is never used * again. That's inefficient, but content should not put indices of different * types in the same element array buffer anyways. Different index types can * only be consumed in separate drawElements calls, so nothing particular is * to be achieved by lumping them in the same buffer object. */ template<typename T> struct WebGLElementArrayCacheTree { /* A too-high kSkippedBottomTreeLevels would harm the performance of small * drawElements calls. A too-low kSkippedBottomTreeLevels would cause undue * memory usage. The current value has been validated by some benchmarking. * See bug 732660. */ static const size_t kSkippedBottomTreeLevels = 3; static const size_t kElementsPerLeaf = 1 << kSkippedBottomTreeLevels; // Since kElementsPerLeaf is POT: static const size_t kElementsPerLeafMask = kElementsPerLeaf - 1; private: // The WebGLElementArrayCache that owns this tree: WebGLElementArrayCache& mParent; // The tree's internal data storage. Its length is 2 * (number of leaves) // because of its data layout explained in the above class comment. FallibleTArray<T> mTreeData; public: // Constructor. Takes a reference to the WebGLElementArrayCache that is to be // the parent. Does not initialize the tree. Should be followed by a call // to Update() to attempt initializing the tree. explicit WebGLElementArrayCacheTree(WebGLElementArrayCache& value) : mParent(value) { } T GlobalMaximum() const { return mTreeData[1]; } // returns the index of the parent node; if treeIndex=1 (the root node), // the return value is 0. static size_t ParentNode(size_t treeIndex) { MOZ_ASSERT(treeIndex > 1); return treeIndex >> 1; } static bool IsRightNode(size_t treeIndex) { MOZ_ASSERT(treeIndex > 1); return treeIndex & 1; } static bool IsLeftNode(size_t treeIndex) { MOZ_ASSERT(treeIndex > 1); return !IsRightNode(treeIndex); } static size_t SiblingNode(size_t treeIndex) { MOZ_ASSERT(treeIndex > 1); return treeIndex ^ 1; } static size_t LeftChildNode(size_t treeIndex) { MOZ_ASSERT(treeIndex); return treeIndex << 1; } static size_t RightChildNode(size_t treeIndex) { MOZ_ASSERT(treeIndex); return SiblingNode(LeftChildNode(treeIndex)); } static size_t LeftNeighborNode(size_t treeIndex, size_t distance = 1) { MOZ_ASSERT(treeIndex > 1); return treeIndex - distance; } static size_t RightNeighborNode(size_t treeIndex, size_t distance = 1) { MOZ_ASSERT(treeIndex > 1); return treeIndex + distance; } size_t NumLeaves() const { // See class comment for why we the tree storage size is 2 * numLeaves. return mTreeData.Length() >> 1; } size_t LeafForElement(size_t element) const { size_t leaf = element / kElementsPerLeaf; MOZ_ASSERT(leaf < NumLeaves()); return leaf; } size_t LeafForByte(size_t byte) const { return LeafForElement(byte / sizeof(T)); } // Returns the index, into the tree storage, where a given leaf is stored. size_t TreeIndexForLeaf(size_t leaf) const { // See above class comment. The tree storage is an array of length // 2 * numLeaves. The leaves are stored in its second half. return leaf + NumLeaves(); } static size_t LastElementUnderSameLeaf(size_t element) { return element | kElementsPerLeafMask; } static size_t FirstElementUnderSameLeaf(size_t element) { return element & ~kElementsPerLeafMask; } static size_t NextMultipleOfElementsPerLeaf(size_t numElements) { MOZ_ASSERT(numElements >= 1); return ((numElements - 1) | kElementsPerLeafMask) + 1; } bool Validate(T maxAllowed, size_t firstLeaf, size_t lastLeaf) { size_t firstTreeIndex = TreeIndexForLeaf(firstLeaf); size_t lastTreeIndex = TreeIndexForLeaf(lastLeaf); while (true) { // Given that we tweak these values in nontrivial ways, it doesn't // hurt to do this sanity check. MOZ_ASSERT(firstTreeIndex <= lastTreeIndex); // Final case where there is only one node to validate at the // current tree level: if (lastTreeIndex == firstTreeIndex) { const T& curData = mTreeData[firstTreeIndex]; return curData <= maxAllowed; } // If the first node at current tree level is a right node, handle // it individually and replace it with its right neighbor, which is // a left node. if (IsRightNode(firstTreeIndex)) { const T& curData = mTreeData[firstTreeIndex]; if (curData > maxAllowed) return false; firstTreeIndex = RightNeighborNode(firstTreeIndex); } // If the last node at current tree level is a left node, handle it // individually and replace it with its left neighbor, which is a // right node. if (IsLeftNode(lastTreeIndex)) { const T& curData = mTreeData[lastTreeIndex]; if (curData > maxAllowed) return false; lastTreeIndex = LeftNeighborNode(lastTreeIndex); } /* At this point it can happen that firstTreeIndex and lastTreeIndex * "crossed" eachother. That happens if firstTreeIndex was a right * node and lastTreeIndex was its right neighor: In that case, both * above tweaks happened and as a result, they ended up being * swapped: LastTreeIndex is now the _left_ neighbor of * firstTreeIndex. When that happens, there is nothing left to * validate. */ if (lastTreeIndex == LeftNeighborNode(firstTreeIndex)) return true; // Walk up one level. firstTreeIndex = ParentNode(firstTreeIndex); lastTreeIndex = ParentNode(lastTreeIndex); } } // Updates the tree from the parent's buffer contents. Fallible, as it // may have to resize the tree storage. bool Update(size_t firstByte, size_t lastByte); size_t SizeOfIncludingThis(mozilla::MallocSizeOf mallocSizeOf) const { return mallocSizeOf(this) + mTreeData.ShallowSizeOfExcludingThis(mallocSizeOf); } }; // TreeForType: just a template helper to select the right tree object for a given // element type. template<typename T> struct TreeForType {}; template<> struct TreeForType<uint8_t> { static UniquePtr<WebGLElementArrayCacheTree<uint8_t>>& Value(WebGLElementArrayCache* b) { return b->mUint8Tree; } }; template<> struct TreeForType<uint16_t> { static UniquePtr<WebGLElementArrayCacheTree<uint16_t>>& Value(WebGLElementArrayCache* b) { return b->mUint16Tree; } }; template<> struct TreeForType<uint32_t> { static UniquePtr<WebGLElementArrayCacheTree<uint32_t>>& Value(WebGLElementArrayCache* b) { return b->mUint32Tree; } }; // Calling this method will 1) update the leaves in this interval // from the raw buffer data, and 2) propagate this update up the tree. template<typename T> bool WebGLElementArrayCacheTree<T>::Update(size_t firstByte, size_t lastByte) { MOZ_ASSERT(firstByte <= lastByte); MOZ_ASSERT(lastByte < mParent.mBytes.Length()); size_t numberOfElements = mParent.mBytes.Length() / sizeof(T); size_t requiredNumLeaves = 0; if (numberOfElements > 0) { /* If we didn't require the number of leaves to be a power of two, then * it would just be equal to * * ceil(numberOfElements / kElementsPerLeaf) * * The way we implement this (division+ceil) operation in integer * arithmetic * is as follows: */ size_t numLeavesNonPOT = (numberOfElements + kElementsPerLeaf - 1) / kElementsPerLeaf; // It only remains to round that up to the next power of two: requiredNumLeaves = RoundUpPow2(numLeavesNonPOT); } // Step #0: If needed, resize our tree data storage. if (requiredNumLeaves != NumLeaves()) { // See class comment for why we the tree storage size is 2 * numLeaves. if (!mTreeData.SetLength(2 * requiredNumLeaves, fallible)) { mTreeData.Clear(); return false; } MOZ_ASSERT(NumLeaves() == requiredNumLeaves); if (NumLeaves()) { // When resizing, update the whole tree, not just the subset // corresponding to the part of the buffer being updated. memset(mTreeData.Elements(), 0, mTreeData.Length() * sizeof(T)); firstByte = 0; lastByte = mParent.mBytes.Length() - 1; } } if (NumLeaves() == 0) return true; lastByte = std::min(lastByte, NumLeaves() * kElementsPerLeaf * sizeof(T) - 1); if (firstByte > lastByte) return true; size_t firstLeaf = LeafForByte(firstByte); size_t lastLeaf = LeafForByte(lastByte); MOZ_ASSERT(firstLeaf <= lastLeaf && lastLeaf < NumLeaves()); size_t firstTreeIndex = TreeIndexForLeaf(firstLeaf); size_t lastTreeIndex = TreeIndexForLeaf(lastLeaf); // Step #1: Initialize the tree leaves from plain buffer data. // That is, each tree leaf must be set to the max of the |kElementsPerLeaf| // corresponding buffer entries. // Condition-less scope to prevent leaking this scope's variables into the // code below: { // TreeIndex is the index of the tree leaf we're writing, i.e. the // destination index. size_t treeIndex = firstTreeIndex; // srcIndex is the index in the source buffer. size_t srcIndex = firstLeaf * kElementsPerLeaf; while (treeIndex <= lastTreeIndex) { T m = 0; size_t a = srcIndex; size_t srcIndexNextLeaf = std::min(a + kElementsPerLeaf, numberOfElements); for (; srcIndex < srcIndexNextLeaf; srcIndex++) { m = std::max(m, mParent.Element<T>(srcIndex)); } mTreeData[treeIndex] = m; treeIndex++; } } // Step #2: Propagate the values up the tree. This is simply a matter of // walking up the tree and setting each node to the max of its two children. while (firstTreeIndex > 1) { // Move up one level. firstTreeIndex = ParentNode(firstTreeIndex); lastTreeIndex = ParentNode(lastTreeIndex); // Fast-exit case where only one node is updated at the current level. if (firstTreeIndex == lastTreeIndex) { mTreeData[firstTreeIndex] = std::max(mTreeData[LeftChildNode(firstTreeIndex)], mTreeData[RightChildNode(firstTreeIndex)]); continue; } size_t child = LeftChildNode(firstTreeIndex); size_t parent = firstTreeIndex; while (parent <= lastTreeIndex) { T a = mTreeData[child]; child = RightNeighborNode(child); T b = mTreeData[child]; child = RightNeighborNode(child); mTreeData[parent] = std::max(a, b); parent = RightNeighborNode(parent); } } return true; } WebGLElementArrayCache::WebGLElementArrayCache() { } WebGLElementArrayCache::~WebGLElementArrayCache() { } bool WebGLElementArrayCache::BufferData(const void* ptr, size_t byteLength) { if (mBytes.Length() != byteLength) { if (!mBytes.SetLength(byteLength, fallible)) { mBytes.Clear(); return false; } } MOZ_ASSERT(mBytes.Length() == byteLength); return BufferSubData(0, ptr, byteLength); } bool WebGLElementArrayCache::BufferSubData(size_t pos, const void* ptr, size_t updateByteLength) { MOZ_ASSERT(pos + updateByteLength <= mBytes.Length()); if (!updateByteLength) return true; // Note, using memcpy on shared racy data is not well-defined, this // will need to use safe-for-races operations when those become available. // See bug 1225033. if (ptr) memcpy(mBytes.Elements() + pos, ptr, updateByteLength); else memset(mBytes.Elements() + pos, 0, updateByteLength); return UpdateTrees(pos, pos + updateByteLength - 1); } bool WebGLElementArrayCache::UpdateTrees(size_t firstByte, size_t lastByte) { bool result = true; if (mUint8Tree) result &= mUint8Tree->Update(firstByte, lastByte); if (mUint16Tree) result &= mUint16Tree->Update(firstByte, lastByte); if (mUint32Tree) result &= mUint32Tree->Update(firstByte, lastByte); return result; } template<typename T> bool WebGLElementArrayCache::Validate(uint32_t maxAllowed, size_t firstElement, size_t countElements) { // If maxAllowed is >= the max T value, then there is no way that a T index // could be invalid. uint32_t maxTSize = std::numeric_limits<T>::max(); if (maxAllowed >= maxTSize) return true; T maxAllowedT(maxAllowed); // Integer overflow must have been handled earlier, so we assert that // maxAllowedT is exactly the max allowed value. MOZ_ASSERT(uint32_t(maxAllowedT) == maxAllowed); if (!mBytes.Length() || !countElements) return true; UniquePtr<WebGLElementArrayCacheTree<T>>& tree = TreeForType<T>::Value(this); if (!tree) { tree = MakeUnique<WebGLElementArrayCacheTree<T>>(*this); if (mBytes.Length()) { bool valid = tree->Update(0, mBytes.Length() - 1); if (!valid) { // Do not assert here. This case would happen if an allocation // failed. We've already settled on fallible allocations around // here. tree = nullptr; return false; } } } size_t lastElement = firstElement + countElements - 1; // Fast-exit path when the global maximum for the whole element array buffer // falls in the allowed range: T globalMax = tree->GlobalMaximum(); if (globalMax <= maxAllowedT) return true; const T* elements = Elements<T>(); // Before calling tree->Validate, we have to validate ourselves the // boundaries of the elements span, to round them to the nearest multiple of // kElementsPerLeaf. size_t firstElementAdjustmentEnd = std::min(lastElement, tree->LastElementUnderSameLeaf(firstElement)); while (firstElement <= firstElementAdjustmentEnd) { const T& curData = elements[firstElement]; if (curData > maxAllowedT) return false; firstElement++; } size_t lastElementAdjustmentEnd = std::max(firstElement, tree->FirstElementUnderSameLeaf(lastElement)); while (lastElement >= lastElementAdjustmentEnd) { const T& curData = elements[lastElement]; if (curData > maxAllowedT) return false; lastElement--; } // at this point, for many tiny validations, we're already done. if (firstElement > lastElement) return true; // general case return tree->Validate(maxAllowedT, tree->LeafForElement(firstElement), tree->LeafForElement(lastElement)); } bool WebGLElementArrayCache::Validate(GLenum type, uint32_t maxAllowed, size_t firstElement, size_t countElements) { if (type == LOCAL_GL_UNSIGNED_BYTE) return Validate<uint8_t>(maxAllowed, firstElement, countElements); if (type == LOCAL_GL_UNSIGNED_SHORT) return Validate<uint16_t>(maxAllowed, firstElement, countElements); if (type == LOCAL_GL_UNSIGNED_INT) return Validate<uint32_t>(maxAllowed, firstElement, countElements); MOZ_ASSERT(false, "Invalid type."); return false; } template<typename T> static size_t SizeOfNullable(mozilla::MallocSizeOf mallocSizeOf, const T& obj) { if (!obj) return 0; return obj->SizeOfIncludingThis(mallocSizeOf); } size_t WebGLElementArrayCache::SizeOfIncludingThis(mozilla::MallocSizeOf mallocSizeOf) const { return mallocSizeOf(this) + mBytes.ShallowSizeOfExcludingThis(mallocSizeOf) + SizeOfNullable(mallocSizeOf, mUint8Tree) + SizeOfNullable(mallocSizeOf, mUint16Tree) + SizeOfNullable(mallocSizeOf, mUint32Tree); } bool WebGLElementArrayCache::BeenUsedWithMultipleTypes() const { // C++ Standard ($4.7) // "If the source type is bool, the value false is converted to zero and // the value true is converted to one." const int num_types_used = (mUint8Tree != nullptr) + (mUint16Tree != nullptr) + (mUint32Tree != nullptr); return num_types_used > 1; } } // end namespace mozilla