diff options
Diffstat (limited to 'toolkit/components/microformats/test/lib/text.js')
-rw-r--r-- | toolkit/components/microformats/test/lib/text.js | 151 |
1 files changed, 151 insertions, 0 deletions
diff --git a/toolkit/components/microformats/test/lib/text.js b/toolkit/components/microformats/test/lib/text.js new file mode 100644 index 000000000..fe94dae0a --- /dev/null +++ b/toolkit/components/microformats/test/lib/text.js @@ -0,0 +1,151 @@ +/* + text + Extracts text string from DOM nodes. Was created to extract text in a whitespace-normalized form. + It works like a none-CSS aware version of IE's innerText function. DO NOT replace this module + with functions such as textContent as it will reduce the quality of data provided to the API user. + + Copyright (C) 2010 - 2015 Glenn Jones. All Rights Reserved. + MIT License: https://raw.github.com/glennjones/microformat-shiv/master/license.txt + Dependencies utilities.js, domutils.js +*/ + + +var Modules = (function (modules) { + + + modules.text = { + + // normalised or whitespace or whitespacetrimmed + textFormat: 'whitespacetrimmed', + + // block level tags, used to add line returns + blockLevelTags: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'pre', 'table', + 'address', 'article', 'aside', 'blockquote', 'caption', 'col', 'colgroup', 'dd', 'div', + 'dt', 'dir', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'header', 'hgroup', 'hr', + 'li', 'map', 'menu', 'nav', 'optgroup', 'option', 'section', 'tbody', 'testarea', + 'tfoot', 'th', 'thead', 'tr', 'td', 'ul', 'ol', 'dl', 'details'], + + // tags to exclude + excludeTags: ['noframe', 'noscript', 'template', 'script', 'style', 'frames', 'frameset'], + + + /** + * parses the text from the DOM Node + * + * @param {DOM Node} node + * @param {String} textFormat + * @return {String} + */ + parse: function(doc, node, textFormat){ + var out; + this.textFormat = (textFormat)? textFormat : this.textFormat; + if(this.textFormat === 'normalised'){ + out = this.walkTreeForText( node ); + if(out !== undefined){ + return this.normalise( doc, out ); + }else{ + return ''; + } + }else{ + return this.formatText( doc, modules.domUtils.textContent(node), this.textFormat ); + } + }, + + + /** + * parses the text from a html string + * + * @param {DOM Document} doc + * @param {String} text + * @param {String} textFormat + * @return {String} + */ + parseText: function( doc, text, textFormat ){ + var node = modules.domUtils.createNodeWithText( 'div', text ); + return this.parse( doc, node, textFormat ); + }, + + + /** + * parses the text from a html string - only for whitespace or whitespacetrimmed formats + * + * @param {String} text + * @param {String} textFormat + * @return {String} + */ + formatText: function( doc, text, textFormat ){ + this.textFormat = (textFormat)? textFormat : this.textFormat; + if(text){ + var out = '', + regex = /(<([^>]+)>)/ig; + + out = text.replace(regex, ''); + if(this.textFormat === 'whitespacetrimmed') { + out = modules.utils.trimWhitespace( out ); + } + + //return entities.decode( out, 2 ); + return modules.domUtils.decodeEntities( doc, out ); + }else{ + return ''; + } + }, + + + /** + * normalises whitespace in given text + * + * @param {String} text + * @return {String} + */ + normalise: function( doc, text ){ + text = text.replace( / /g, ' ') ; // exchanges html entity for space into space char + text = modules.utils.collapseWhiteSpace( text ); // removes linefeeds, tabs and addtional spaces + text = modules.domUtils.decodeEntities( doc, text ); // decode HTML entities + text = text.replace( '–', '-' ); // correct dash decoding + return modules.utils.trim( text ); + }, + + + /** + * walks DOM tree parsing the text from DOM Nodes + * + * @param {DOM Node} node + * @return {String} + */ + walkTreeForText: function( node ) { + var out = '', + j = 0; + + if(node.tagName && this.excludeTags.indexOf( node.tagName.toLowerCase() ) > -1){ + return out; + } + + // if node is a text node get its text + if(node.nodeType && node.nodeType === 3){ + out += modules.domUtils.getElementText( node ); + } + + // get the text of the child nodes + if(node.childNodes && node.childNodes.length > 0){ + for (j = 0; j < node.childNodes.length; j++) { + var text = this.walkTreeForText( node.childNodes[j] ); + if(text !== undefined){ + out += text; + } + } + } + + // if it's a block level tag add an additional space at the end + if(node.tagName && this.blockLevelTags.indexOf( node.tagName.toLowerCase() ) !== -1){ + out += ' '; + } + + return (out === '')? undefined : out ; + } + + }; + + return modules; + +} (Modules || {})); |