From 6168dbe21f5f83b906e562ea0ab232d499b275a6 Mon Sep 17 00:00:00 2001
From: "Matt A. Tobin"
Date: Wed, 15 Jan 2020 14:56:04 -0500
Subject: Add java htmlparser sources that match the original 52-level state

https://hg.mozilla.org/projects/htmlparser/
Commit: abe62ab2a9b69ccb3b5d8a231ec1ae11154c571d
---
 parser/html/java/htmlparser/ruby-gcj/DomUtils.java | 36 ++++
 parser/html/java/htmlparser/ruby-gcj/README | 65 +++++++
 parser/html/java/htmlparser/ruby-gcj/Rakefile | 77 ++++++++
 parser/html/java/htmlparser/ruby-gcj/extconf.rb | 45 +++++
 .../java/htmlparser/ruby-gcj/test/domencoding.rb | 5 +
 parser/html/java/htmlparser/ruby-gcj/test/fonts.rb | 11 ++
 .../html/java/htmlparser/ruby-gcj/test/google.html | 10 +
 .../html/java/htmlparser/ruby-gcj/test/greek.xml | 2 +
 parser/html/java/htmlparser/ruby-gcj/validator.cpp | 210 +++++++++++++++++++++
 9 files changed, 461 insertions(+)
 create mode 100644 parser/html/java/htmlparser/ruby-gcj/DomUtils.java
 create mode 100644 parser/html/java/htmlparser/ruby-gcj/README
 create mode 100644 parser/html/java/htmlparser/ruby-gcj/Rakefile
 create mode 100644 parser/html/java/htmlparser/ruby-gcj/extconf.rb
 create mode 100644 parser/html/java/htmlparser/ruby-gcj/test/domencoding.rb
 create mode 100644 parser/html/java/htmlparser/ruby-gcj/test/fonts.rb
 create mode 100644 parser/html/java/htmlparser/ruby-gcj/test/google.html
 create mode 100644 parser/html/java/htmlparser/ruby-gcj/test/greek.xml
 create mode 100644 parser/html/java/htmlparser/ruby-gcj/validator.cpp

(limited to 'parser/html/java/htmlparser/ruby-gcj')

diff --git a/parser/html/java/htmlparser/ruby-gcj/DomUtils.java b/parser/html/java/htmlparser/ruby-gcj/DomUtils.java
new file mode 100644
index 000000000..dc43da83d
--- /dev/null
+++ b/parser/html/java/htmlparser/ruby-gcj/DomUtils.java
@@ -0,0 +1,43 @@
+import java.util.HashSet;
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.Element;
+
+// Static helpers for the Ruby binding: a set of pinned documents, plus
+// utilities for reading and replacing the text content of an element.
+public class DomUtils {
+
+  // documents currently pinned; the set holds a reference to each of them
+  private static HashSet pinned_list = new HashSet();
+
+  // add a document to the pinned set
+  public static synchronized void pin(Document d) {
+    pinned_list.add(d);
+  }
+
+  // remove a document from the pinned set
+  public static synchronized void unpin(Document d) {
+    pinned_list.remove(d);
+  }
+
+  // append to b all the text content contained by a single element,
+  // including the text of its descendant elements
+  public static void getElementContent(Element e, StringBuffer b) {
+    for (Node n = e.getFirstChild(); n!=null; n=n.getNextSibling()) {
+      if (n.getNodeType() == Node.TEXT_NODE) {
+        b.append(n.getNodeValue());
+      } else if (n.getNodeType() == Node.ELEMENT_NODE) {
+        // recurse into the child n; recursing on e itself never terminates
+        getElementContent((Element) n, b);
+      }
+    }
+  }
+
+  // replace all child nodes of a given element with a single text node
+  public static void setElementContent(Element e, String s) {
+    while (e.hasChildNodes()) {
+      e.removeChild(e.getFirstChild());
+    }
+    e.appendChild(e.getOwnerDocument().createTextNode(s));
+  }
+}
diff --git a/parser/html/java/htmlparser/ruby-gcj/README b/parser/html/java/htmlparser/ruby-gcj/README
new file mode 100644
index 000000000..b368437f7
--- /dev/null
+++ b/parser/html/java/htmlparser/ruby-gcj/README
@@ -0,0 +1,65 @@
+Disclaimer:
+
+  This code is experimental.
+
+  When some people say experimental, they mean "it may not do what it is
+  intended to do; in fact, it might even wipe out your hard drive". I mean
+  that too. But I mean something more than that.
+
+  In this case, experimental means that I don't even know what it is intended
+  to do. I just have a vague vision, and I am trying out various things in
+  the hopes that one of them will work out.
+
+Vision:
+
+  My vague vision is that I would like to see HTML 5 be a success. For me to
+  consider it to be a success, it needs to be a standard, be interoperable,
+  and be ubiquitous.
+
+  I believe that the Validator.nu parser can be used to bootstrap that
+  process. It is written in Java. Has been compiled into JavaScript. Has
+  been translated into C++ based on the Mozilla libraries with the intent of
+  being included in Firefox. It very closely tracks to the standard.
+ + For the moment, the effort is on extending that to another language (Ruby) + on a single environment (i.e., Linux). Once that is complete, intent is to + evaluate the results, decide what needs to be changed, and what needs to be + done to support other languages and environments. + + The bar I'm setting for myself isn't just another SWIG generated low level + interface to a DOM, but rather a best of breed interface; which for Ruby + seems to be the one pioneered by Hpricot and adopted by Nokogiri. Success + will mean passing all of the tests from one of those two parsers as well as + all of the HTML5 tests. + +Build instructions: + + You'll need icu4j and chardet jars. If you checked out and ran dldeps you + are already all set: + + svn co http://svn.versiondude.net/whattf/build/trunk/ build + python build/build.py checkout dldeps + + Fedora 11: + + yum install ruby-devel rubygem-rake java-1.5.0-gcj-devel gcc-c++ + + Ubuntu 9.04: + + apt-get install ruby ruby1.8-dev rake gcj g++ + + Also at this time, you need to install a jdk (e.g. sun-java6-jdk), simply + because the javac that comes with gcj doesn't support -sourcepath, and + I haven't spent the time to find a replacement. + + Finally, make sure that libjaxp1.3-java is *not* installed. + + http://gcc.gnu.org/ml/java/2009-06/msg00055.html + + If this is done, you should be all set. + + cd htmlparser/ruby-gcj + rake test + + If things are successful, the last lines of the output will list the + font attributes and values found in the test/google.html file. 
+
diff --git a/parser/html/java/htmlparser/ruby-gcj/Rakefile b/parser/html/java/htmlparser/ruby-gcj/Rakefile
new file mode 100644
index 000000000..7b5180253
--- /dev/null
+++ b/parser/html/java/htmlparser/ruby-gcj/Rakefile
@@ -0,0 +1,89 @@
+# Build script for the experimental Ruby binding (validator.so) of the
+# htmlparser sources, compiled with gcj.  Set the 'deps' environment
+# variable to override where the icu4j and chardet jars are looked up.
+
+deps = ENV['deps'] || '../../dependencies'
+icu4j = "#{deps}/icu4j-4_0.jar"
+chardet = "#{deps}/mozilla/intl/chardet/java/dist/lib/chardet.jar"
+# the libgcj jar that sorts last among those installed in /usr/share/java
+libgcj = Dir['/usr/share/java/libgcj*.jar'].grep(/gcj[-\d.]*jar$/).sort.last
+
+task :default => %w(headers libs Makefile validator.so)
+
+# headers
+
+hdb = 'nu/validator/htmlparser/dom/HtmlDocumentBuilder'
+task :headers => %W(headers/DomUtils.h headers/#{hdb}.h)
+
+# compile DomUtils.java, then generate its C++ header with gcjh
+file 'headers/DomUtils.h' => 'DomUtils.java' do |t|
+  mkdir_p %w(classes headers), :verbose => false
+  sh "javac -d classes #{t.prerequisites.first}"
+  sh "gcjh -force -o #{t.name} -cp #{libgcj}:classes DomUtils"
+end
+
+# same for HtmlDocumentBuilder, compiled against the sources in ../src
+file "headers/#{hdb}.h" => "../src/#{hdb}.java" do |t|
+  mkdir_p %w(classes headers), :verbose => false
+  sh "javac -cp #{icu4j}:#{chardet} -d classes -sourcepath ../src " +
+    t.prerequisites.first
+  sh "gcjh -force -cp classes -o #{t.name} -cp #{libgcj}:classes " +
+    hdb.gsub('/','.')
+end
+
+# libs
+
+# one shared library per component, all built into lib/
+task :libs => %w(htmlparser chardet icu).map {|name| "lib/libnu-#{name}.so"}
+
+# every parser source except those under a xom directory
+htmlparser = Dir['../src/**/*.java'].reject {|name| name.include? '/xom/'}
+file 'lib/libnu-htmlparser.so' => htmlparser + ['DomUtils.java'] do |t|
+  mkdir_p 'lib', :verbose => false
+  sh "gcj -shared --classpath=#{icu4j}:#{chardet} -fPIC " +
+    "-o #{t.name} #{t.prerequisites.join(' ')}"
+end
+
+file 'lib/libnu-chardet.so' => chardet do |t|
+  mkdir_p 'lib', :verbose => false
+  sh "gcj -shared -fPIC -o #{t.name} #{t.prerequisites.join(' ')}"
+end
+
+file 'lib/libnu-icu.so' => icu4j do |t|
+  mkdir_p 'lib', :verbose => false
+  sh "gcj -shared -fPIC -o #{t.name} #{t.prerequisites.join(' ')}"
+end
+
+# module
+
+file 'Makefile' do
+  sh "ruby extconf.rb --with-gcj=#{libgcj}"
+end
+
+file 'validator.so' => %w(Makefile validator.cpp headers/DomUtils.h) do
+  # sh, unlike system, aborts the build when the command fails
+  sh 'make'
+end
+
+# link validator.so into nu/ so that require 'nu/validator' finds it
+file 'nu/validator.so' do
+  mkdir_p 'nu', :verbose => false
+  sh 'ln -s -t nu ../validator.so'
+end
+
+# tasks
+
+task :test => [:default, 'nu/validator.so'] do
+  # the libnu-*.so libraries built above live in lib/
+  ENV['LD_LIBRARY_PATH']='lib'
+  sh 'ruby test/fonts.rb test/google.html'
+end
+
+task :clean do
+  rm_rf %W(classes lib nu mkmf.log headers/DomUtils.h headers/#{hdb}.h) +
+    Dir['*.o'] + Dir['*.so']
+end
+
+task :clobber => :clean do
+  rm_rf %w(headers Makefile)
+end
diff --git a/parser/html/java/htmlparser/ruby-gcj/extconf.rb b/parser/html/java/htmlparser/ruby-gcj/extconf.rb
new file mode 100644
index 000000000..415cf430a
--- /dev/null
+++ b/parser/html/java/htmlparser/ruby-gcj/extconf.rb
@@ -0,0 +1,53 @@
+# mkmf configuration for the nu/validator extension: generates any missing
+# C++ headers for the JAXP classes in the gcj jar, then writes the Makefile.
+
+require 'mkmf'
+
+# system dependencies
+gcj = with_config('gcj', '/usr/share/java/libgcj.jar')
+
+# headers for JAXP
+CONFIG['CC'] = 'g++'
+with_cppflags('-xc++') do
+
+  unless find_header('org/w3c/dom/Document.h', 'headers')
+
+    # one header per javax/org w3c/xml class in the jar; names containing
+    # '$' are skipped, as are headers that already exist
+    `jar tf #{gcj}`.split.each do |file|
+      next unless file =~ /\.class$/
+      next unless file =~ /^(javax|org)\/(w3c|xml)/
+      next if file.include? '$'
+
+      dest = 'headers/' + file.sub(/\.class$/,'.h')
+      name = file.sub(/\.class$/,'').gsub('/','.')
+
+      next if File.exist? dest
+
+      cmd = "gcjh -cp #{gcj} -o #{dest} #{name}"
+      puts cmd
+      break unless system cmd
+      # rewrite the generated header in place so that the identifier
+      # 'namespace' (a C++ keyword) becomes 'namespace$'
+      system "ruby -pi -e '$_.sub!(/namespace namespace$/," +
+        "\"namespace namespace$\")' #{dest}"
+      system "ruby -pi -e '$_.sub!(/::namespace::/," +
+        "\"::namespace$::\")' #{dest}"
+    end
+
+    # still missing: stop with a failure status so the caller notices
+    exit 1 unless find_header('org/w3c/dom/Document.h', 'headers')
+  end
+
+  find_header 'nu/validator/htmlparser/dom/HtmlDocumentBuilder.h', 'headers'
+end
+
+# Java libraries
+Config::CONFIG['CC'] = 'g++ -shared'
+dir_config('nu-htmlparser', nil, 'lib')
+have_library 'nu-htmlparser'
+have_library 'nu-icu'
+have_library 'nu-chardet'
+
+# Ruby library
+create_makefile 'nu/validator'
diff --git a/parser/html/java/htmlparser/ruby-gcj/test/domencoding.rb b/parser/html/java/htmlparser/ruby-gcj/test/domencoding.rb
new file mode 100644
index 000000000..1beb94c10
--- /dev/null
+++ b/parser/html/java/htmlparser/ruby-gcj/test/domencoding.rb
@@ -0,0 +1,6 @@
+# print the name of the root element of each file named on the command line
+require 'nu/validator'
+
+ARGV.each do |arg|
+  puts Nu::Validator::parse(open(arg)).root.name
+end
diff --git a/parser/html/java/htmlparser/ruby-gcj/test/fonts.rb b/parser/html/java/htmlparser/ruby-gcj/test/fonts.rb
new file mode 100644
index 000000000..595e3ae06
--- /dev/null
+++ b/parser/html/java/htmlparser/ruby-gcj/test/fonts.rb
@@ -0,0 +1,12 @@
+# print every attribute of every font element in each document given
+require 'nu/validator'
+require 'open-uri'
+
+ARGV.each do |arg|
+  doc = Nu::Validator::parse(open(arg))
+  doc.xpath("//*[local-name()='font']").each do |font|
+    font.attributes.each do |name, attr|
+      puts "#{name} => #{attr.value}"
+    end
+  end
+end
diff --git a/parser/html/java/htmlparser/ruby-gcj/test/google.html b/parser/html/java/htmlparser/ruby-gcj/test/google.html
new file mode 100644
index 000000000..8d2183b29
--- /dev/null
+++ b/parser/html/java/htmlparser/ruby-gcj/test/google.html
@@ -0,0 +1,10 @@
+Google



 
  Advanced Search
  Preferences
  Language Tools

Find an opportunity to volunteer in your community today.


Advertising Programs - Business Solutions - About Google

©2009 - Privacy

\ No newline at end of file
diff --git a/parser/html/java/htmlparser/ruby-gcj/test/greek.xml b/parser/html/java/htmlparser/ruby-gcj/test/greek.xml
new file mode 100644
index 000000000..a14d23eb1
--- /dev/null
+++ b/parser/html/java/htmlparser/ruby-gcj/test/greek.xml
@@ -0,0 +1,2 @@
+
+
diff --git a/parser/html/java/htmlparser/ruby-gcj/validator.cpp b/parser/html/java/htmlparser/ruby-gcj/validator.cpp
new file mode 100644
index 000000000..aadd24abe
--- /dev/null
+++ b/parser/html/java/htmlparser/ruby-gcj/validator.cpp
@@ -0,0 +1,237 @@
+// NOTE(review): the header names in angle brackets were lost from this copy
+// of the patch; the list below is reconstructed from the classes this file
+// uses and must be checked against the upstream validator.cpp.
+#include <gcj/cni.h>
+
+#include <java/io/ByteArrayInputStream.h>
+#include <java/lang/String.h>
+#include <java/lang/Throwable.h>
+#include <javax/xml/parsers/DocumentBuilder.h>
+#include <javax/xml/parsers/DocumentBuilderFactory.h>
+#include <javax/xml/xpath/XPath.h>
+#include <javax/xml/xpath/XPathConstants.h>
+#include <javax/xml/xpath/XPathExpression.h>
+#include <javax/xml/xpath/XPathFactory.h>
+#include <org/w3c/dom/Attr.h>
+#include <org/w3c/dom/Document.h>
+#include <org/w3c/dom/Element.h>
+#include <org/w3c/dom/NamedNodeMap.h>
+#include <org/w3c/dom/Node.h>
+#include <org/w3c/dom/NodeList.h>
+#include <org/xml/sax/InputSource.h>
+
+#include "nu/validator/htmlparser/dom/HtmlDocumentBuilder.h"
+
+#include "DomUtils.h"
+
+#include "ruby.h"
+
+using namespace java::io;
+using namespace java::lang;
+using namespace java::util;
+using namespace javax::xml::parsers;
+using namespace javax::xml::xpath;
+using namespace nu::validator::htmlparser::dom;
+using namespace org::w3c::dom;
+using namespace org::xml::sax;
+
+static VALUE jaxp_Document;
+static VALUE jaxp_Attr;
+static VALUE jaxp_Element;
+static ID ID_read;
+static ID ID_doc;
+static ID ID_element;
+
+// convert a Java string into a Ruby string (nil for a null reference)
+static VALUE j2r(String *string) {
+  if (string == NULL) return Qnil;
+  // encode straight into the Ruby string's own buffer instead of going
+  // through a variable-length array on the stack
+  VALUE result = rb_str_new(NULL, JvGetStringUTFLength(string));
+  // the region is measured in Java chars, not in UTF-8 bytes
+  JvGetStringUTFRegion(string, 0, string->length(), RSTRING(result)->ptr);
+  return result;
+}
+
+// convert a Ruby string into a Java string
+static String *r2j(VALUE string) {
+  // StringValueCStr rejects non-strings and embedded NULs with a Ruby
+  // exception instead of reading through an arbitrary object
+  return JvNewStringUTF(StringValueCStr(string));
+}
+
+// release the Java Document associated with this Ruby Document
+static void vnu_document_free(Document *doc) {
+  DomUtils::unpin(doc);
+}
+
+// Nu::Validator::parse( string|file )
+// returns a Jaxp::Document, or nil if the parser throws
+static VALUE vnu_parse(VALUE self, VALUE input) {
+  // read file-like objects into memory.  TODO: buffer such objects
+  if (rb_respond_to(input, ID_read))
+    input = rb_funcall(input, ID_read, 0);
+
+  // raise TypeError here rather than misread a non-string through RSTRING
+  StringValue(input);
+
+  try {
+    HtmlDocumentBuilder *parser = new HtmlDocumentBuilder();
+
+    // convert input in to a ByteArrayInputStream
+    jbyteArray bytes = JvNewByteArray(RSTRING(input)->len);
+    memcpy(elements(bytes), RSTRING(input)->ptr, RSTRING(input)->len);
+    InputSource *source = new InputSource(new ByteArrayInputStream(bytes));
+
+    // parse, pin, and wrap
+    Document *doc = parser->parse(source);
+    DomUtils::pin(doc);
+    return Data_Wrap_Struct(jaxp_Document, NULL, vnu_document_free, doc);
+  } catch (java::lang::Throwable *ex) {
+    // same policy as jaxp_parse: report the Java failure, return nil
+    ex->printStackTrace();
+    return Qnil;
+  }
+}
+
+// Jaxp::parse( string|file )
+// returns a Jaxp::Document, or nil if the parser throws
+static VALUE jaxp_parse(VALUE self, VALUE input) {
+  // read file-like objects into memory.  TODO: buffer such objects
+  if (rb_respond_to(input, ID_read))
+    input = rb_funcall(input, ID_read, 0);
+
+  // raise TypeError here rather than misread a non-string through RSTRING
+  StringValue(input);
+
+  try {
+    DocumentBuilderFactory *factory = DocumentBuilderFactory::newInstance();
+    DocumentBuilder *parser = factory->newDocumentBuilder();
+
+    jbyteArray bytes = JvNewByteArray(RSTRING(input)->len);
+    memcpy(elements(bytes), RSTRING(input)->ptr, RSTRING(input)->len);
+    Document *doc = parser->parse(new ByteArrayInputStream(bytes));
+    DomUtils::pin(doc);
+    return Data_Wrap_Struct(jaxp_Document, NULL, vnu_document_free, doc);
+  } catch (java::lang::Throwable *ex) {
+    ex->printStackTrace();
+    return Qnil;
+  }
+}
+
+
+// Jaxp::Document#encoding
+static VALUE jaxp_document_encoding(VALUE rdoc) {
+  Document *jdoc;
+  Data_Get_Struct(rdoc, Document, jdoc);
+  return j2r(jdoc->getXmlEncoding());
+}
+
+// Jaxp::Document#root
+static VALUE jaxp_document_root(VALUE rdoc) {
+  Document *jdoc;
+  Data_Get_Struct(rdoc, Document, jdoc);
+
+  Element *jelement = jdoc->getDocumentElement();
+  if (jelement==NULL) return Qnil;
+
+  // the element has no free function of its own; the @doc reference keeps
+  // the Ruby Document, and with it the pinned Java Document, alive
+  VALUE relement = Data_Wrap_Struct(jaxp_Element, NULL, NULL, jelement);
+  rb_ivar_set(relement, ID_doc, rdoc);
+  return relement;
+}
+
+// Jaxp::Document#xpath
+static VALUE jaxp_document_xpath(VALUE rdoc, VALUE path) {
+  Document *jdoc;
+  Data_Get_Struct(rdoc, Document, jdoc);
+
+  Element *jelement = jdoc->getDocumentElement();
+  if (jelement==NULL) return Qnil;
+
+  XPath *xpath = XPathFactory::newInstance()->newXPath();
+  XPathExpression *expr = xpath->compile(r2j(path));
+  NodeList *list = (NodeList*) expr->evaluate(jdoc, XPathConstants::NODESET);
+
+  VALUE result = rb_ary_new();
+  for (int i=0; i<list->getLength(); i++) {
+    VALUE relement = Data_Wrap_Struct(jaxp_Element, NULL, NULL, list->item(i));
+    rb_ivar_set(relement, ID_doc, rdoc);
+    rb_ary_push(result, relement);
+  }
+  return result;
+}
+
+// Jaxp::Element#name
+static VALUE jaxp_element_name(VALUE relement) {
+  Element *jelement;
+  Data_Get_Struct(relement, Element, jelement);
+  return j2r(jelement->getNodeName());
+}
+
+// Jaxp::Element#attributes
+// returns a hash of attribute name => Jaxp::Attr
+static VALUE jaxp_element_attributes(VALUE relement) {
+  Element *jelement;
+  Data_Get_Struct(relement, Element, jelement);
+  VALUE result = rb_hash_new();
+  NamedNodeMap *map = jelement->getAttributes();
+  // xpath results are wrapped whatever their node type, and only element
+  // nodes have an attribute map
+  if (map == NULL) return result;
+  for (int i=0; i<map->getLength(); i++) {
+    Attr *jattr = (Attr *) map->item(i);
+    VALUE rattr = Data_Wrap_Struct(jaxp_Attr, NULL, NULL, jattr);
+    rb_ivar_set(rattr, ID_element, relement);
+    rb_hash_aset(result, j2r(jattr->getName()), rattr);
+  }
+  return result;
+}
+
+// Jaxp::Attr#value
+static VALUE jaxp_attribute_value(VALUE rattribute) {
+  Attr *jattribute;
+  Data_Get_Struct(rattribute, Attr, jattribute);
+  return j2r(jattribute->getValue());
+}
+
+typedef VALUE (ruby_method)(...);
+
+// Nu::Validator module initialization
+extern "C" void Init_validator() {
+  JvCreateJavaVM(NULL);
+  JvAttachCurrentThread(NULL, NULL);
+  JvInitClass(&DomUtils::class$);
+  JvInitClass(&XPathFactory::class$);
+  JvInitClass(&XPathConstants::class$);
+
+  VALUE jaxp = rb_define_module("Jaxp");
+  rb_define_singleton_method(jaxp, "parse", (ruby_method*)&jaxp_parse, 1);
+
+  VALUE nu = rb_define_module("Nu");
+  VALUE validator = rb_define_module_under(nu, "Validator");
+  rb_define_singleton_method(validator, "parse", (ruby_method*)&vnu_parse, 1);
+
+  jaxp_Document = rb_define_class_under(jaxp, "Document", rb_cObject);
+  rb_define_method(jaxp_Document, "encoding",
+    (ruby_method*)&jaxp_document_encoding, 0);
+  rb_define_method(jaxp_Document, "root",
+    (ruby_method*)&jaxp_document_root, 0);
+  rb_define_method(jaxp_Document, "xpath",
+    (ruby_method*)&jaxp_document_xpath, 1);
+
+  jaxp_Element = rb_define_class_under(jaxp, "Element", rb_cObject);
+  rb_define_method(jaxp_Element, "name",
+    (ruby_method*)&jaxp_element_name, 0);
+  rb_define_method(jaxp_Element, "attributes",
+    (ruby_method*)&jaxp_element_attributes, 0);
+
+  jaxp_Attr = rb_define_class_under(jaxp, "Attr", rb_cObject);
+  rb_define_method(jaxp_Attr, "value",
+    (ruby_method*)&jaxp_attribute_value, 0);
+
+  ID_read = rb_intern("read");
+  ID_doc = rb_intern("@doc");
+  ID_element = rb_intern("@element");
+}
-- 
cgit v1.2.3