diff options
author | Matt A. Tobin <email@mattatobin.com> | 2020-01-15 14:56:04 -0500 |
---|---|---|
committer | Matt A. Tobin <email@mattatobin.com> | 2020-01-15 14:56:04 -0500 |
commit | 6168dbe21f5f83b906e562ea0ab232d499b275a6 (patch) | |
tree | 658a4b27554c85ebcaad655fc83f2c2bb99e8e80 /parser/html/java/htmlparser/test-src | |
parent | 09314667a692fedff8564fc347c8a3663474faa6 (diff) | |
download | UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar.gz UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar.lz UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.tar.xz UXP-6168dbe21f5f83b906e562ea0ab232d499b275a6.zip |
Add java htmlparser sources that match the original 52-level state
https://hg.mozilla.org/projects/htmlparser/
Commit: abe62ab2a9b69ccb3b5d8a231ec1ae11154c571d
Diffstat (limited to 'parser/html/java/htmlparser/test-src')
28 files changed, 3588 insertions, 0 deletions
diff --git a/parser/html/java/htmlparser/test-src/nu/validator/encoding/test/Big5Tester.java b/parser/html/java/htmlparser/test-src/nu/validator/encoding/test/Big5Tester.java new file mode 100644 index 000000000..395f9eb15 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/encoding/test/Big5Tester.java @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2015 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +package nu.validator.encoding.test; + +import nu.validator.encoding.Encoding; + +public class Big5Tester extends EncodingTester { + + public static void main(String[] args) { + new Big5Tester().test(); + } + + private void test() { + // ASCII + decodeBig5("\u6162", "\u0061\u0062"); + // Edge cases + decodeBig5("\u8740", "\u43F0"); + decodeBig5("\uFEFE", "\u79D4"); + decodeBig5("\uFEFD", "\uD864\uDD0D"); + decodeBig5("\u8862", "\u00CA\u0304"); + decodeBig5("\u8864", "\u00CA\u030C"); + decodeBig5("\u8866", "\u00CA"); + decodeBig5("\u88A3", "\u00EA\u0304"); + decodeBig5("\u88A5", "\u00EA\u030C"); + decodeBig5("\u88A7", "\u00EA"); + decodeBig5("\u99D4", "\u8991"); + decodeBig5("\u99D5", "\uD85E\uDD67"); + decodeBig5("\u99D6", "\u8A29"); + // Edge cases surrounded with ASCII + decodeBig5("\u6187\u4062", "\u0061\u43F0\u0062"); + decodeBig5("\u61FE\uFE62", "\u0061\u79D4\u0062"); + decodeBig5("\u61FE\uFD62", "\u0061\uD864\uDD0D\u0062"); + decodeBig5("\u6188\u6262", "\u0061\u00CA\u0304\u0062"); + decodeBig5("\u6188\u6462", "\u0061\u00CA\u030C\u0062"); + decodeBig5("\u6188\u6662", "\u0061\u00CA\u0062"); + decodeBig5("\u6188\uA362", "\u0061\u00EA\u0304\u0062"); + decodeBig5("\u6188\uA562", "\u0061\u00EA\u030C\u0062"); + decodeBig5("\u6188\uA762", "\u0061\u00EA\u0062"); + decodeBig5("\u6199\uD462", "\u0061\u8991\u0062"); + decodeBig5("\u6199\uD562", "\u0061\uD85E\uDD67\u0062"); + decodeBig5("\u6199\uD662", "\u0061\u8A29\u0062"); + // Bad sequences + decodeBig5("\u8061", "\uFFFD\u0061"); + decodeBig5("\uFF61", "\uFFFD\u0061"); + decodeBig5("\uFE39", "\uFFFD\u0039"); + decodeBig5("\u8766", "\uFFFD\u0066"); + decodeBig5("\u8140", "\uFFFD\u0040"); + decodeBig5("\u6181", "\u0061\uFFFD"); + + // ASCII + encodeBig5("\u0061\u0062", "\u6162"); + // Edge cases + encodeBig5("\u9EA6\u0061", "\u3F61"); + encodeBig5("\uD858\uDE6B\u0061", "\u3F61"); + encodeBig5("\u3000", "\uA140"); + encodeBig5("\u20AC", "\uA3E1"); + encodeBig5("\u4E00", "\uA440"); + encodeBig5("\uD85D\uDE07", 
"\uC8A4"); + encodeBig5("\uFFE2", "\uC8CD"); + encodeBig5("\u79D4", "\uFEFE"); + // Not in index + encodeBig5("\u2603\u0061", "\u3F61"); + // duplicate low bits + encodeBig5("\uD840\uDFB5", "\uFD6A"); + // prefer last + encodeBig5("\u2550", "\uF9F9"); + } + + private void decodeBig5(String input, String expectation) { + decode(input, expectation, Encoding.BIG5); + } + + private void encodeBig5(String input, String expectation) { + encode(input, expectation, Encoding.BIG5); + } +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/encoding/test/EncodingTester.java b/parser/html/java/htmlparser/test-src/nu/validator/encoding/test/EncodingTester.java new file mode 100644 index 000000000..a910a01e9 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/encoding/test/EncodingTester.java @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2015 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +package nu.validator.encoding.test; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CoderResult; +import java.nio.charset.CodingErrorAction; + +import nu.validator.encoding.Encoding; + +public class EncodingTester { + + protected byte[] stringToBytes(String str) { + byte[] bytes = new byte[str.length() * 2]; + for (int i = 0; i < str.length(); i++) { + int pair = (int) str.charAt(i); + bytes[i * 2] = (byte) (pair >> 8); + bytes[i * 2 + 1] = (byte) (pair & 0xFF); + } + return bytes; + } + + protected void decode(String input, String expectation, Encoding encoding) { + // Use the convenience method from Charset + + byte[] bytes = stringToBytes(input); + ByteBuffer byteBuf = ByteBuffer.wrap(bytes); + CharBuffer charBuf = encoding.decode(byteBuf); + + if (charBuf.remaining() != expectation.length()) { + err("When decoding from a single long buffer, the output length was wrong. 
Expected: " + + expectation.length() + ", got: " + charBuf.remaining(), + bytes, expectation); + return; + } + + for (int i = 0; i < expectation.length(); i++) { + char expect = expectation.charAt(i); + char actual = charBuf.get(); + if (actual != expect) { + err("When decoding from a single long buffer, failed at position " + + i + + ", expected: " + + charToHex(expect) + + ", got: " + + charToHex(actual), bytes, expectation); + return; + } + } + + // Decode with a 1-byte input buffer + + byteBuf = ByteBuffer.allocate(1); + charBuf = CharBuffer.allocate(expectation.length() + 2); + CharsetDecoder decoder = encoding.newDecoder(); + decoder.onMalformedInput(CodingErrorAction.REPLACE); + for (int i = 0; i < bytes.length; i++) { + byteBuf.position(0); + byteBuf.put(bytes[i]); + byteBuf.position(0); + CoderResult result = decoder.decode(byteBuf, charBuf, + (i + 1) == bytes.length); + if (result.isMalformed()) { + err("Decoder reported a malformed sequence when asked to replace at index: " + + i, bytes, expectation); + return; + } else if (result.isUnmappable()) { + err("Decoder claimed unmappable sequence, which none of these decoders should do.", + bytes, expectation); + return; + } else if (result.isOverflow()) { + err("Decoder claimed overflow when the output buffer is know to be large enough.", + bytes, expectation); + } else if (!result.isUnderflow()) { + err("Bogus coder result, expected underflow.", bytes, + expectation); + } + } + CoderResult result = decoder.flush(charBuf); + if (result.isMalformed()) { + err("Decoder reported a malformed sequence when asked to replace when flushing.", + bytes, expectation); + return; + } else if (result.isUnmappable()) { + err("Decoder claimed unmappable sequence when flushing, which none of these decoders should do.", + bytes, expectation); + return; + } else if (result.isOverflow()) { + err("Decoder claimed overflow when flushing when the output buffer is know to be large enough.", + bytes, expectation); + } else if 
(!result.isUnderflow()) { + err("Bogus coder result when flushing, expected underflow.", bytes, + expectation); + } + + charBuf.limit(charBuf.position()); + charBuf.position(0); + + for (int i = 0; i < expectation.length(); i++) { + char expect = expectation.charAt(i); + char actual = charBuf.get(); + if (actual != expect) { + err("When decoding one byte at a time in REPORT mode, failed at position " + + i + + ", expected: " + + charToHex(expect) + + ", got: " + + charToHex(actual), bytes, expectation); + return; + } + } + + // Decode with 1-char output buffer + + byteBuf = ByteBuffer.wrap(bytes); + charBuf = CharBuffer.allocate(1); + + decoder.reset(); // Let's test this while at it + decoder.onMalformedInput(CodingErrorAction.REPLACE); + int codeUnitPos = 0; + while (byteBuf.hasRemaining()) { + charBuf.position(0); + charBuf.put('\u0000'); + charBuf.position(0); + result = decoder.decode(byteBuf, charBuf, false); + if (result.isMalformed()) { + err("Decoder reported a malformed sequence when asked to replace at index (decoding one output code unit at a time): " + + byteBuf.position(), bytes, expectation); + return; + } else if (result.isUnmappable()) { + err("Decoder claimed unmappable sequence (decoding one output code unit at a time), which none of these decoders should do.", + bytes, expectation); + return; + } else if (result.isUnderflow()) { + if (byteBuf.hasRemaining()) { + err("When decoding one output code unit at a time, decoder claimed underflow when there was input remaining.", + bytes, expectation); + return; + } + } else if (!result.isOverflow()) { + err("Bogus coder result, expected overflow.", bytes, + expectation); + } + if (charBuf.position() == 1) { + charBuf.position(0); + char actual = charBuf.get(); + char expect = expectation.charAt(codeUnitPos); + if (actual != expect) { + err("When decoding one output code unit at a time in REPLACE mode, failed at position " + + byteBuf.position() + + ", expected: " + + charToHex(expect) + ", got: " + 
charToHex(actual), + bytes, expectation); + return; + } + codeUnitPos++; + } + } + + charBuf.position(0); + charBuf.put('\u0000'); + charBuf.position(0); + result = decoder.decode(byteBuf, charBuf, true); + + if (charBuf.position() == 1) { + charBuf.position(0); + char actual = charBuf.get(); + char expect = expectation.charAt(codeUnitPos); + if (actual != expect) { + err("When decoding one output code unit at a time in REPLACE mode, failed at position " + + byteBuf.position() + + ", expected: " + + charToHex(expect) + ", got: " + charToHex(actual), + bytes, expectation); + return; + } + codeUnitPos++; + } + + charBuf.position(0); + charBuf.put('\u0000'); + charBuf.position(0); + result = decoder.flush(charBuf); + if (result.isMalformed()) { + err("Decoder reported a malformed sequence when asked to replace when flushing (one output at a time).", + bytes, expectation); + return; + } else if (result.isUnmappable()) { + err("Decoder claimed unmappable sequence when flushing, which none of these decoders should do (one output at a time).", + bytes, expectation); + return; + } else if (result.isOverflow()) { + err("Decoder claimed overflow when flushing when the output buffer is know to be large enough (one output at a time).", + bytes, expectation); + } else if (!result.isUnderflow()) { + err("Bogus coder result when flushing, expected underflow (one output at a time).", + bytes, expectation); + } + + if (charBuf.position() == 1) { + charBuf.position(0); + char actual = charBuf.get(); + char expect = expectation.charAt(codeUnitPos); + if (actual != expect) { + err("When decoding one output code unit at a time in REPLACE mode, failed when flushing, expected: " + + charToHex(expect) + ", got: " + charToHex(actual), + bytes, expectation); + return; + } + } + + // TODO: 2 bytes at a time starting at 0 and 2 bytes at a time starting + // at 1 + } + + protected void encode(String input, String expectation, Encoding encoding) { + byte[] expectedBytes = 
stringToBytes(expectation); + CharBuffer charBuf = CharBuffer.wrap(input); + + // Use the convenience method from Charset + + ByteBuffer byteBuf = encoding.encode(charBuf); + + if (byteBuf.remaining() != expectedBytes.length) { + err("When encoding from a single long buffer, the output length was wrong. Expected: " + + expectedBytes.length + ", got: " + byteBuf.remaining(), + input, expectedBytes); + return; + } + + for (int i = 0; i < expectedBytes.length; i++) { + byte expect = expectedBytes[i]; + byte actual = byteBuf.get(); + if (actual != expect) { + err("When encoding from a single long buffer, failed at position " + + i + + ", expected: " + + byteToHex(expect) + + ", got: " + + byteToHex(actual), input, expectedBytes); + return; + } + } + + // Encode with a 1-char input buffer + + charBuf = CharBuffer.allocate(1); + byteBuf = ByteBuffer.allocate(expectedBytes.length + 2); + CharsetEncoder encoder = encoding.newEncoder(); + encoder.onMalformedInput(CodingErrorAction.REPLACE); + encoder.onUnmappableCharacter(CodingErrorAction.REPLACE); + for (int i = 0; i < input.length(); i++) { + charBuf.position(0); + charBuf.put(input.charAt(i)); + charBuf.position(0); + CoderResult result = encoder.encode(charBuf, byteBuf, + (i + 1) == input.length()); + if (result.isMalformed()) { + err("Encoder reported a malformed sequence when asked to replace at index: " + + i, input, expectedBytes); + return; + } else if (result.isUnmappable()) { + err("Encoder reported an upmappable sequence when asked to replace at index: " + + i, input, expectedBytes); + return; + } else if (result.isOverflow()) { + err("Encoder claimed overflow when the output buffer is know to be large enough.", + input, expectedBytes); + } else if (!result.isUnderflow()) { + err("Bogus coder result, expected underflow.", input, + expectedBytes); + } + } + CoderResult result = encoder.flush(byteBuf); + if (result.isMalformed()) { + err("Encoder reported a malformed sequence when asked to replace when 
flushing.", + input, expectedBytes); + return; + } else if (result.isUnmappable()) { + err("Encoder reported an unmappable sequence when asked to replace when flushing.", + input, expectedBytes); + return; + } else if (result.isOverflow()) { + err("Encoder claimed overflow when flushing when the output buffer is know to be large enough.", + input, expectedBytes); + } else if (!result.isUnderflow()) { + err("Bogus coder result when flushing, expected underflow.", input, + expectedBytes); + + } + + byteBuf.limit(byteBuf.position()); + byteBuf.position(0); + + for (int i = 0; i < expectedBytes.length; i++) { + byte expect = expectedBytes[i]; + byte actual = byteBuf.get(); + if (actual != expect) { + err("When encoding one char at a time in REPORT mode, failed at position " + + i + + ", expected: " + + byteToHex(expect) + + ", got: " + + byteToHex(actual), input, expectedBytes); + return; + } + } + + // Decode with 1-byte output buffer + + charBuf = CharBuffer.wrap(input); + byteBuf = ByteBuffer.allocate(1); + + encoder.reset(); // Let's test this while at it + encoder.onMalformedInput(CodingErrorAction.REPLACE); + encoder.onUnmappableCharacter(CodingErrorAction.REPLACE); + int bytePos = 0; + while (charBuf.hasRemaining()) { + byteBuf.position(0); + byteBuf.put((byte)0); + byteBuf.position(0); + result = encoder.encode(charBuf, byteBuf, false); + if (result.isMalformed()) { + err("Encoder reported a malformed sequence when asked to replace at index (decoding one output code unit at a time): " + + charBuf.position(), input, expectedBytes); + return; + } else if (result.isUnmappable()) { + err("Encoder reported an unmappable sequence when asked to replace at index (decoding one output code unit at a time): " + + charBuf.position(), input, expectedBytes); + return; + } else if (result.isUnderflow()) { + if (charBuf.hasRemaining()) { + err("When encoding one output byte at a time, encoder claimed underflow when there was input remaining.", + input, expectedBytes); + 
return; + } + } else if (!result.isOverflow()) { + err("Bogus coder result, expected overflow.", input, expectedBytes); + } + if (byteBuf.position() == 1) { + byteBuf.position(0); + byte actual = byteBuf.get(); + byte expect = expectedBytes[bytePos]; + if (actual != expect) { + err("When encoding one output byte at a time in REPLACE mode, failed at position " + + charBuf.position() + + ", expected: " + + byteToHex(expect) + ", got: " + byteToHex(actual), + input, expectedBytes); + return; + } + bytePos++; + } + } + + byteBuf.position(0); + byteBuf.put((byte)0); + byteBuf.position(0); + result = encoder.encode(charBuf, byteBuf, true); + + if (byteBuf.position() == 1) { + byteBuf.position(0); + byte actual = byteBuf.get(); + byte expect = expectedBytes[bytePos]; + if (actual != expect) { + err("When encoding one output byte at a time in REPLACE mode, failed at position " + + charBuf.position() + + ", expected: " + + byteToHex(expect) + ", got: " + byteToHex(actual), + input, expectedBytes); + return; + } + bytePos++; + } + + byteBuf.position(0); + byteBuf.put((byte)0); + byteBuf.position(0); + result = encoder.flush(byteBuf); + if (result.isMalformed()) { + err("Encoder reported a malformed sequence when asked to replace when flushing (one output at a time).", + input, expectedBytes); + return; + } else if (result.isUnmappable()) { + err("Encoder reported an unmappable sequence when asked to replace when flushing (one output at a time).", + input, expectedBytes); + return; + } else if (result.isOverflow()) { + err("Encoder claimed overflow when flushing when the output buffer is know to be large enough (one output at a time).", + input, expectedBytes); + } else if (!result.isUnderflow()) { + err("Bogus coder result when flushing, expected underflow (one output at a time).", + input, expectedBytes); + } + + if (byteBuf.position() == 1) { + byteBuf.position(0); + byte actual = byteBuf.get(); + byte expect = expectedBytes[bytePos]; + if (actual != expect) { + err("When 
encoding one output code unit at a time in REPLACE mode, failed when flushing, expected: " + + byteToHex(expect) + ", got: " + byteToHex(actual), + input, expectedBytes); + return; + } + } + + // TODO: 2 bytes at a time starting at 0 and 2 bytes at a time starting + // at 1 + } + + private String charToHex(char c) { + String hex = Integer.toHexString(c); + switch (hex.length()) { + case 1: + return "000" + hex; + case 2: + return "00" + hex; + case 3: + return "0" + hex; + default: + return hex; + } + } + + private String byteToHex(byte b) { + String hex = Integer.toHexString(((int) b & 0xFF)); + switch (hex.length()) { + case 1: + return "0" + hex; + default: + return hex; + } + } + + private void err(String msg, byte[] bytes, String expectation) { + System.err.println(msg); + System.err.print("Input:"); + for (int i = 0; i < bytes.length; i++) { + System.err.print(' '); + System.err.print(byteToHex(bytes[i])); + } + System.err.println(); + System.err.print("Expect:"); + for (int i = 0; i < expectation.length(); i++) { + System.err.print(' '); + System.err.print(charToHex(expectation.charAt(i))); + } + System.err.println(); + } + + private void err(String msg, String chars, byte[] expectation) { + System.err.println(msg); + System.err.print("Input:"); + for (int i = 0; i < chars.length(); i++) { + System.err.print(' '); + System.err.print(charToHex(chars.charAt(i))); + } + System.err.println(); + System.err.print("Expect:"); + for (int i = 0; i < expectation.length; i++) { + System.err.print(' '); + System.err.print(byteToHex(expectation[i])); + } + System.err.println(); + } + +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/DecoderLoopTester.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/DecoderLoopTester.java new file mode 100644 index 000000000..3337a6555 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/DecoderLoopTester.java @@ -0,0 +1,115 @@ +/* + * Copyright (c) 
2007 Henri Sivonen + * Copyright (c) 2008 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +package nu.validator.htmlparser.test; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CodingErrorAction; + +import nu.validator.htmlparser.common.Heuristics; +import nu.validator.htmlparser.io.Encoding; +import nu.validator.htmlparser.io.HtmlInputStreamReader; + +import org.xml.sax.ErrorHandler; +import org.xml.sax.SAXException; + +public class DecoderLoopTester { + + private static final int LEAD_OFFSET = 0xD800 - (0x10000 >> 10); + + private static final int NUMBER_OR_ASTRAL_CHARS = 24500; + + private void runTest(int padding) throws SAXException, IOException { + Encoding utf8 = Encoding.forName("UTF-8"); + char[] charArr = new char[1 + padding + 2 * NUMBER_OR_ASTRAL_CHARS]; + byte[] byteArr; + int i = 0; + charArr[i++] = '\uFEFF'; + for (int j = 0; j < padding; j++) { + charArr[i++] = 'x'; + } + for (int j = 0; j < NUMBER_OR_ASTRAL_CHARS; j++) { + int value = 0x10000 + j; + charArr[i++] = (char) (LEAD_OFFSET + (value >> 10)); + charArr[i++] = (char) (0xDC00 + (value & 0x3FF)); +// charArr[i++] = 'y'; +// charArr[i++] = 'z'; + + } + CharBuffer charBuffer = CharBuffer.wrap(charArr); + CharsetEncoder enc = utf8.newEncoder(); + enc.onMalformedInput(CodingErrorAction.REPORT); + enc.onUnmappableCharacter(CodingErrorAction.REPORT); + ByteBuffer byteBuffer = enc.encode(charBuffer); + byteArr = new byte[byteBuffer.limit()]; + byteBuffer.get(byteArr); + + ErrorHandler eh = new SystemErrErrorHandler(); + compare(new HtmlInputStreamReader(new ByteArrayInputStream(byteArr), eh, null, null, Heuristics.NONE), padding, charArr, byteArr); + compare(new HtmlInputStreamReader(new ByteArrayInputStream(byteArr), eh, null, null, utf8), padding, charArr, byteArr); + } + + /** + * @param padding + * @param charArr + * @param byteArr + * @throws SAXException + * @throws IOException + */ + private void compare(HtmlInputStreamReader 
reader, int padding, char[] charArr, byte[] byteArr) throws SAXException, IOException { + char[] readBuffer = new char[2048]; + int offset = 0; + int num = 0; + int readNum = 0; + while ((num = reader.read(readBuffer)) != -1) { + for (int j = 0; j < num; j++) { + System.out.println(offset + j); + if (readBuffer[j] != charArr[offset + j]) { + throw new RuntimeException("Test failed. Char: " + Integer.toHexString(readBuffer[j]) + " j: " + j + " readNum: " + readNum); + } + } + offset += num; + readNum++; + } + } + + void runTests() throws SAXException, IOException { + for (int i = 0; i < 4; i++) { + runTest(i); + } + } + + /** + * @param args + * @throws IOException + * @throws SAXException + */ + public static void main(String[] args) throws IOException, SAXException { + new DecoderLoopTester().runTests(); + } + +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/DomIdTester.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/DomIdTester.java new file mode 100644 index 000000000..a3866f5d9 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/DomIdTester.java @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2008 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.htmlparser.test; + +import java.io.IOException; +import java.io.StringReader; + +import org.w3c.dom.Document; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import nu.validator.htmlparser.dom.HtmlDocumentBuilder; + +public class DomIdTester { + + private static final String testSrc = "<div><h1 id='bar' class='foo'>buoeoa</h1><p id='foo'>uoeuo</p></div>"; + + /** + * @param args + * @throws IOException + * @throws SAXException + */ + public static void main(String[] args) throws SAXException, IOException { + HtmlDocumentBuilder builder = new HtmlDocumentBuilder(); + Document doc = builder.parse(new InputSource(new StringReader(testSrc))); + System.out.println(doc.getElementById("foo").getLocalName()); + } + +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/DomTest.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/DomTest.java new file mode 100644 index 000000000..07d054b9e --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/DomTest.java @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2009 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the 
/**
 * Minimal experiment against the platform DOM implementation: creates an
 * XHTML {@code html} element in a namespace-aware document and sets an
 * {@code xmlns:foo} attribute on it through the non-namespace-aware
 * {@code setAttribute}. Prints nothing; it only has to complete.
 */
public class DomTest {

    /**
     * @param args ignored
     * @throws Exception if the DOM implementation rejects any step
     */
    public static void main(String[] args) throws Exception {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        // not setting this causes pain and suffering with SVG
        factory.setNamespaceAware(true);
        Document document = factory.newDocumentBuilder().newDocument();
        Element html = document.createElementNS("http://www.w3.org/1999/xhtml", "html");
        html.setAttribute("xmlns:foo", "bar");
    }
}
the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.htmlparser.test; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; + +import nu.validator.htmlparser.common.Heuristics; +import nu.validator.htmlparser.io.Encoding; +import nu.validator.htmlparser.io.HtmlInputStreamReader; + +import org.xml.sax.SAXException; + +public class EncodingTester { + + private final InputStream aggregateStream; + + private final StringBuilder builder = new StringBuilder(); + + /** + * @param aggregateStream + */ + public EncodingTester(InputStream aggregateStream) { + this.aggregateStream = aggregateStream; + } + + private void runTests() throws IOException, SAXException { + while (runTest()) { + // spin + } + } + + private boolean runTest() throws IOException, SAXException { + if (skipLabel()) { + return false; + } + UntilHashInputStream stream = new UntilHashInputStream(aggregateStream); + HtmlInputStreamReader reader = new HtmlInputStreamReader(stream, null, + null, null, Heuristics.NONE); + Charset charset = reader.getCharset(); + 
stream.close(); + if (skipLabel()) { + System.err.println("Premature end of test data."); + return false; + } + builder.setLength(0); + loop: for (;;) { + int b = aggregateStream.read(); + switch (b) { + case '\n': + break loop; + case -1: + System.err.println("Premature end of test data."); + return false; + default: + builder.append(((char) b)); + } + } + String sniffed = charset.name(); + String expected = Encoding.forName(builder.toString()).newDecoder().charset().name(); + if (expected.equalsIgnoreCase(sniffed)) { + System.err.println("Success."); + // System.err.println(stream); + } else { + System.err.println("Failure. Expected: " + expected + " got " + + sniffed + "."); + System.err.println(stream); + } + return true; + } + + private boolean skipLabel() throws IOException { + int b = aggregateStream.read(); + if (b == -1) { + return true; + } + for (;;) { + b = aggregateStream.read(); + if (b == -1) { + return true; + } else if (b == 0x0A) { + return false; + } + } + } + + /** + * @param args + * @throws SAXException + * @throws IOException + */ + public static void main(String[] args) throws IOException, SAXException { + for (int i = 0; i < args.length; i++) { + EncodingTester tester = new EncodingTester(new FileInputStream( + args[i])); + tester.runTests(); + } + } + +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/JSONArrayTokenHandler.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/JSONArrayTokenHandler.java new file mode 100644 index 000000000..2fcfc4960 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/JSONArrayTokenHandler.java @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2007 Henri Sivonen + * Copyright (c) 2008 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without 
limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.htmlparser.test; + +import nu.validator.htmlparser.common.TokenHandler; +import nu.validator.htmlparser.impl.ElementName; +import nu.validator.htmlparser.impl.HtmlAttributes; +import nu.validator.htmlparser.impl.Tokenizer; + +import org.xml.sax.ErrorHandler; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; + +import com.sdicons.json.model.JSONArray; +import com.sdicons.json.model.JSONBoolean; +import com.sdicons.json.model.JSONNull; +import com.sdicons.json.model.JSONObject; +import com.sdicons.json.model.JSONString; + +public class JSONArrayTokenHandler implements TokenHandler, ErrorHandler { + + private static final JSONString DOCTYPE = new JSONString("DOCTYPE"); + + private static final JSONString START_TAG = new JSONString("StartTag"); + + private static final JSONString END_TAG = new JSONString("EndTag"); + + private static final JSONString COMMENT = new JSONString("Comment"); + + private static final JSONString CHARACTER = new JSONString("Character"); + + private static final JSONString PARSE_ERROR = new JSONString("ParseError"); + + private static final 
char[] REPLACEMENT_CHARACTER = { '\uFFFD' }; + + private final StringBuilder builder = new StringBuilder(); + + private JSONArray array = null; + + private int contentModelFlag; + + private String contentModelElement; + + public void setContentModelFlag(int contentModelFlag, String contentModelElement) { + this.contentModelFlag = contentModelFlag; + this.contentModelElement = contentModelElement; + } + + public void characters(char[] buf, int start, int length) + throws SAXException { + builder.append(buf, start, length); + } + + private void flushCharacters() { + if (builder.length() > 0) { + JSONArray token = new JSONArray(); + token.getValue().add(CHARACTER); + token.getValue().add(new JSONString(builder.toString())); + array.getValue().add(token); + builder.setLength(0); + } + } + + public void comment(char[] buf, int start, int length) throws SAXException { + flushCharacters(); + JSONArray token = new JSONArray(); + token.getValue().add(COMMENT); + token.getValue().add(new JSONString(new String(buf, start, length))); + array.getValue().add(token); + } + + public void doctype(String name, String publicIdentifier, String systemIdentifier, boolean forceQuirks) throws SAXException { + flushCharacters(); + JSONArray token = new JSONArray(); + token.getValue().add(DOCTYPE); + token.getValue().add(new JSONString(name)); + token.getValue().add(publicIdentifier == null ? JSONNull.NULL : new JSONString(publicIdentifier)); + token.getValue().add(systemIdentifier == null ? 
JSONNull.NULL : new JSONString(systemIdentifier)); + token.getValue().add(new JSONBoolean(!forceQuirks)); + array.getValue().add(token); + } + + public void endTag(ElementName eltName) throws SAXException { + String name = eltName.name; + flushCharacters(); + JSONArray token = new JSONArray(); + token.getValue().add(END_TAG); + token.getValue().add(new JSONString(name)); + array.getValue().add(token); + } + + public void eof() throws SAXException { + flushCharacters(); + } + + public void startTokenization(Tokenizer self) throws SAXException { + array = new JSONArray(); + if (contentModelElement != null) { + self.setStateAndEndTagExpectation(contentModelFlag, contentModelElement); + } + } + + public void startTag(ElementName eltName, HtmlAttributes attributes, + boolean selfClosing) throws SAXException { + String name = eltName.name; + flushCharacters(); + JSONArray token = new JSONArray(); + token.getValue().add(START_TAG); + token.getValue().add(new JSONString(name)); + JSONObject attrs = new JSONObject(); + for (int i = 0; i < attributes.getLength(); i++) { + attrs.getValue().put(attributes.getQNameNoBoundsCheck(i), + new JSONString(attributes.getValueNoBoundsCheck(i))); + } + token.getValue().add(attrs); + if (selfClosing) { + token.getValue().add(JSONBoolean.TRUE); + } + array.getValue().add(token); + } + + public boolean wantsComments() throws SAXException { + return true; + } + + public void error(SAXParseException exception) throws SAXException { + flushCharacters(); + array.getValue().add(PARSE_ERROR); + } + + public void fatalError(SAXParseException exception) throws SAXException { + throw new RuntimeException("Should never happen."); + } + + public void warning(SAXParseException exception) throws SAXException { + } + + /** + * Returns the array. 
+ * + * @return the array + */ + public JSONArray getArray() { + return array; + } + + public void endTokenization() throws SAXException { + + } + + @Override public void zeroOriginatingReplacementCharacter() + throws SAXException { + builder.append(REPLACEMENT_CHARACTER, 0, 1); + } + + @Override public boolean cdataSectionAllowed() throws SAXException { + return false; + } + + @Override public void ensureBufferSpace(int inputLength) + throws SAXException { + } + +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/ListErrorHandler.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/ListErrorHandler.java new file mode 100644 index 000000000..9a207f277 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/ListErrorHandler.java @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2007 Henri Sivonen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
/**
 * SAX error handler that collects recoverable errors in a list and remembers
 * whether a fatal error was reported. Warnings are ignored.
 */
public class ListErrorHandler implements ErrorHandler {

    /** Recorded errors, each formatted as "column: message". */
    private final LinkedList<String> errors = new LinkedList<String>();

    /** Set once a fatal error has been reported; never cleared. */
    private boolean fatal = false;

    /** Records the error's column number and message. */
    public void error(SAXParseException spe) throws SAXException {
        StringBuilder entry = new StringBuilder();
        entry.append(spe.getColumnNumber());
        entry.append(": ");
        entry.append(spe.getMessage());
        errors.add(entry.toString());
    }

    /** Flags that a fatal error occurred; the message is not recorded. */
    public void fatalError(SAXParseException arg0) throws SAXException {
        fatal = true;
    }

    /** Warnings are deliberately dropped. */
    public void warning(SAXParseException arg0) throws SAXException {
    }

    /**
     * Returns the errors.
     *
     * @return the live list of recorded errors, in reporting order
     */
    public LinkedList<String> getErrors() {
        return errors;
    }

    /**
     * Returns the fatal.
     *
     * @return {@code true} if a fatal error has been reported
     */
    public boolean isFatal() {
        return fatal;
    }

}
/**
 * Error handler for both SAX parsing and XSLT transformation that prints every
 * report to {@code System.err} (encoded as UTF-8) and remembers whether an
 * error or fatal error has been seen.
 *
 * @version $Id$
 * @author hsivonen
 */
public class SystemErrErrorHandler implements ErrorHandler, ErrorListener {

    /** UTF-8 writer over the {@code System.err} in effect at construction. */
    private Writer out;

    /** Set by error and fatal-error reports; cleared by {@link #reset()}. */
    private boolean inError = false;

    public SystemErrErrorHandler() {
        try {
            out = new OutputStreamWriter(System.err, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * @see org.xml.sax.ErrorHandler#warning(org.xml.sax.SAXParseException)
     */
    public void warning(SAXParseException e) throws SAXException {
        report("Warning", e);
    }

    /**
     * @see org.xml.sax.ErrorHandler#error(org.xml.sax.SAXParseException)
     */
    public void error(SAXParseException e) throws SAXException {
        inError = true;
        report("Error", e);
    }

    /**
     * @see org.xml.sax.ErrorHandler#fatalError(org.xml.sax.SAXParseException)
     */
    public void fatalError(SAXParseException e) throws SAXException {
        inError = true;
        report("Fatal Error", e);
    }

    /**
     * Returns the inError.
     *
     * @return {@code true} if an error or fatal error has been reported since
     *         construction or the last {@link #reset()}
     */
    public boolean isInError() {
        return inError;
    }

    /** Clears the error flag. */
    public void reset() {
        inError = false;
    }

    public void error(TransformerException e) throws TransformerException {
        inError = true;
        report("Error", e);
    }

    public void fatalError(TransformerException e)
            throws TransformerException {
        inError = true;
        report("Fatal Error", e);
    }

    public void warning(TransformerException e)
            throws TransformerException {
        report("Warning", e);
    }

    /** Prints a SAX report; the location block is always written. */
    private void report(String severity, SAXParseException e)
            throws SAXException {
        try {
            writeHeading(severity, e.getMessage());
            writeLocation(e.getSystemId(), e.getLineNumber(),
                    e.getColumnNumber());
            finishReport();
        } catch (IOException e1) {
            throw new SAXException(e1);
        }
    }

    /** Prints a transformer report; location only if a locator exists. */
    private void report(String severity, TransformerException e)
            throws TransformerException {
        try {
            writeHeading(severity, e.getMessage());
            SourceLocator sourceLocator = e.getLocator();
            if (sourceLocator != null) {
                writeLocation(sourceLocator.getSystemId(),
                        sourceLocator.getLineNumber(),
                        sourceLocator.getColumnNumber());
            }
            finishReport();
        } catch (IOException e1) {
            throw new TransformerException(e1);
        }
    }

    /** Writes the severity line followed by the message, if there is one. */
    private void writeHeading(String severity, String message)
            throws IOException {
        out.write(severity);
        out.write(":\n");
        // Writer.write(String) rejects null, and exceptions may legitimately
        // carry no message; print nothing in that case instead of failing.
        if (message != null) {
            out.write(message);
        }
    }

    /** Writes the file, line and column of the report. */
    private void writeLocation(String systemId, int line, int column)
            throws IOException {
        out.write("\nFile: ");
        out.write((systemId == null) ? "Unknown" : systemId);
        out.write("\nLine: ");
        out.write(Integer.toString(line));
        out.write(" Col: ");
        out.write(Integer.toString(column));
    }

    /** Terminates the report with a blank line and flushes the writer. */
    private void finishReport() throws IOException {
        out.write("\n\n");
        out.flush();
    }

}
+ */ + +package nu.validator.htmlparser.test; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; + +import nu.validator.htmlparser.common.TokenHandler; +import nu.validator.htmlparser.impl.ElementName; +import nu.validator.htmlparser.impl.ErrorReportingTokenizer; +import nu.validator.htmlparser.impl.HtmlAttributes; +import nu.validator.htmlparser.impl.Tokenizer; +import nu.validator.htmlparser.io.Driver; + +import org.xml.sax.ErrorHandler; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; + +public class TokenPrinter implements TokenHandler, ErrorHandler { + + private final Writer writer; + + public void characters(char[] buf, int start, int length) + throws SAXException { + try { + boolean lineStarted = true; + writer.write('-'); + for (int i = start; i < start + length; i++) { + if (!lineStarted) { + writer.write("\n-"); + lineStarted = true; + } + char c = buf[i]; + if (c == '\n') { + writer.write("\\n"); + lineStarted = false; + } else { + writer.write(c); + } + } + writer.write('\n'); + } catch (IOException e) { + throw new SAXException(e); + } + } + + public void comment(char[] buf, int start, int length) throws SAXException { + try { + writer.write('!'); + writer.write(buf, start, length); + writer.write('\n'); + } catch (IOException e) { + throw new SAXException(e); + } + } + + public void doctype(String name, String publicIdentifier, String systemIdentifier, boolean forceQuirks) throws SAXException { + try { + writer.write('D'); + writer.write(name); + writer.write(' '); + writer.write("" + forceQuirks); + writer.write('\n'); + } catch (IOException e) { + throw new SAXException(e); + } + } + + public void endTag(ElementName eltName) throws SAXException { + try { + writer.write(')'); + writer.write(eltName.name); + writer.write('\n'); + } catch (IOException e) { + throw new SAXException(e); + } + } + + public 
void eof() throws SAXException { + try { + writer.write("E\n"); + } catch (IOException e) { + throw new SAXException(e); + } + } + + public void startTokenization(Tokenizer self) throws SAXException { + + } + + public void startTag(ElementName eltName, HtmlAttributes attributes, boolean selfClosing) + throws SAXException { + try { + writer.write('('); + writer.write(eltName.name); + writer.write('\n'); + for (int i = 0; i < attributes.getLength(); i++) { + writer.write('A'); + writer.write(attributes.getQNameNoBoundsCheck(i)); + writer.write(' '); + writer.write(attributes.getValueNoBoundsCheck(i)); + writer.write('\n'); + } + } catch (IOException e) { + throw new SAXException(e); + } + } + + public boolean wantsComments() throws SAXException { + return true; + } + + public static void main(String[] args) throws SAXException, IOException { + TokenPrinter printer = new TokenPrinter(new OutputStreamWriter(System.out, "UTF-8")); + Driver tokenizer = new Driver(new ErrorReportingTokenizer(printer)); + tokenizer.setErrorHandler(printer); + File file = new File(args[0]); + InputSource is = new InputSource(new FileInputStream(file)); + is.setSystemId(file.toURI().toASCIIString()); + tokenizer.tokenize(is); + } + + /** + * @param writer + */ + public TokenPrinter(final Writer writer) { + this.writer = writer; + } + + public void error(SAXParseException exception) throws SAXException { + try { + writer.write("R "); + writer.write(exception.getMessage()); + writer.write("\n"); + } catch (IOException e) { + throw new SAXException(e); + } + } + + public void fatalError(SAXParseException exception) throws SAXException { + try { + writer.write("F "); + writer.write(exception.getMessage()); + writer.write("\n"); + } catch (IOException e) { + throw new SAXException(e); + } + } + + public void warning(SAXParseException exception) throws SAXException { + try { + writer.write("W "); + writer.write(exception.getMessage()); + writer.write("\n"); + } catch (IOException e) { + throw new 
SAXException(e); + } + } + + public void endTokenization() throws SAXException { + try { + writer.flush(); + writer.close(); + } catch (IOException e) { + throw new SAXException(e); + } + } + + @Override public void zeroOriginatingReplacementCharacter() + throws SAXException { + try { + writer.write("0\n"); + } catch (IOException e) { + throw new SAXException(e); + } + } + + @Override public boolean cdataSectionAllowed() throws SAXException { + return false; + } + + @Override public void ensureBufferSpace(int inputLength) + throws SAXException { + } +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/TokenizerTester.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/TokenizerTester.java new file mode 100644 index 000000000..76ea7543a --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/TokenizerTester.java @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2007 Henri Sivonen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.htmlparser.test; + +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; +import java.io.Writer; + +import nu.validator.htmlparser.common.XmlViolationPolicy; +import nu.validator.htmlparser.impl.ErrorReportingTokenizer; +import nu.validator.htmlparser.impl.Tokenizer; +import nu.validator.htmlparser.io.Driver; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import antlr.RecognitionException; +import antlr.TokenStreamException; + +import com.sdicons.json.model.JSONArray; +import com.sdicons.json.model.JSONObject; +import com.sdicons.json.model.JSONString; +import com.sdicons.json.model.JSONValue; +import com.sdicons.json.parser.JSONParser; + +public class TokenizerTester { + + private static JSONString PLAINTEXT = new JSONString("PLAINTEXT state"); + + private static JSONString PCDATA = new JSONString("DATA state"); + + private static JSONString RCDATA = new JSONString("RCDATA state"); + + private static JSONString RAWTEXT = new JSONString("RAWTEXT state"); + + private static boolean jsonDeepEquals(JSONValue one, JSONValue other) { + if (one.isSimple()) { + return one.equals(other); + } else if (one.isArray()) { + if (other.isArray()) { + JSONArray oneArr = (JSONArray) one; + JSONArray otherArr = (JSONArray) other; + return oneArr.getValue().equals(otherArr.getValue()); + } else { + return false; + } + } else if (one.isObject()) { + if (other.isObject()) { + JSONObject oneObject = (JSONObject) one; + JSONObject otherObject = 
(JSONObject) other; + return oneObject.getValue().equals(otherObject.getValue()); + } else { + return false; + } + } else { + throw new RuntimeException("Should never happen."); + } + } + + private JSONArray tests; + + private final JSONArrayTokenHandler tokenHandler; + + private final Driver driver; + + private final Writer writer; + + private TokenizerTester(InputStream stream) throws TokenStreamException, + RecognitionException, UnsupportedEncodingException { + tokenHandler = new JSONArrayTokenHandler(); + driver = new Driver(new ErrorReportingTokenizer(tokenHandler)); + driver.setCommentPolicy(XmlViolationPolicy.ALLOW); + driver.setContentNonXmlCharPolicy(XmlViolationPolicy.ALLOW); + driver.setContentSpacePolicy(XmlViolationPolicy.ALLOW); + driver.setNamePolicy(XmlViolationPolicy.ALLOW); + driver.setXmlnsPolicy(XmlViolationPolicy.ALLOW); + driver.setErrorHandler(tokenHandler); + writer = new OutputStreamWriter(System.out, "UTF-8"); + JSONParser jsonParser = new JSONParser(new InputStreamReader(stream, + "UTF-8")); + JSONObject obj = (JSONObject) jsonParser.nextValue(); + tests = (JSONArray) obj.get("tests"); + if (tests == null) { + tests = (JSONArray) obj.get("xmlViolationTests"); + driver.setCommentPolicy(XmlViolationPolicy.ALTER_INFOSET); + driver.setContentNonXmlCharPolicy(XmlViolationPolicy.ALTER_INFOSET); + driver.setNamePolicy(XmlViolationPolicy.ALTER_INFOSET); + driver.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET); + } + } + + private void runTests() throws SAXException, IOException { + for (JSONValue val : tests.getValue()) { + runTest((JSONObject) val); + } + writer.flush(); + } + + private void runTest(JSONObject test) throws SAXException, IOException { + String inputString = ((JSONString) test.get("input")).getValue(); + JSONArray expectedTokens = (JSONArray) test.get("output"); + String description = ((JSONString) test.get("description")).getValue(); + JSONString lastStartTagJSON = ((JSONString) test.get("lastStartTag")); + String lastStartTag = 
lastStartTagJSON == null ? null + : lastStartTagJSON.getValue(); + JSONArray contentModelFlags = (JSONArray) test.get("initialStates"); + if (contentModelFlags == null) { + runTestInner(inputString, expectedTokens, description, + Tokenizer.DATA, null); + } else { + for (JSONValue value : contentModelFlags.getValue()) { + if (PCDATA.equals(value)) { + runTestInner(inputString, expectedTokens, description, + Tokenizer.DATA, lastStartTag); + } else if (RAWTEXT.equals(value)) { + runTestInner(inputString, expectedTokens, description, + Tokenizer.RAWTEXT, lastStartTag); + } else if (RCDATA.equals(value)) { + runTestInner(inputString, expectedTokens, description, + Tokenizer.RCDATA, lastStartTag); + } else if (PLAINTEXT.equals(value)) { + runTestInner(inputString, expectedTokens, description, + Tokenizer.PLAINTEXT, lastStartTag); + } else { + throw new RuntimeException("Broken test data."); + } + } + } + } + + /** + * @param contentModelElement + * @param contentModelFlag + * @param test + * @throws SAXException + * @throws IOException + */ + private void runTestInner(String inputString, JSONArray expectedTokens, + String description, int contentModelFlag, + String contentModelElement) throws SAXException, IOException { + tokenHandler.setContentModelFlag(contentModelFlag, contentModelElement); + InputSource is = new InputSource(new StringReader(inputString)); + try { + driver.tokenize(is); + JSONArray actualTokens = tokenHandler.getArray(); + if (jsonDeepEquals(actualTokens, expectedTokens)) { + writer.write("Success\n"); + } else { + writer.write("Failure\n"); + writer.write(description); + writer.write("\nInput:\n"); + writer.write(inputString); + writer.write("\nExpected tokens:\n"); + writer.write(expectedTokens.render(false)); + writer.write("\nActual tokens:\n"); + writer.write(actualTokens.render(false)); + writer.write("\n"); + } + } catch (Throwable t) { + writer.write("Failure\n"); + writer.write(description); + writer.write("\nInput:\n"); + 
writer.write(inputString); + writer.write("\n"); + t.printStackTrace(new PrintWriter(writer, false)); + } + } + + /** + * @param args + * @throws RecognitionException + * @throws TokenStreamException + * @throws IOException + * @throws SAXException + */ + public static void main(String[] args) throws TokenStreamException, + RecognitionException, SAXException, IOException { + for (int i = 0; i < args.length; i++) { + TokenizerTester tester = new TokenizerTester(new FileInputStream( + args[i])); + tester.runTests(); + } + } + +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/TreeDumpContentHandler.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/TreeDumpContentHandler.java new file mode 100644 index 000000000..9b95b763e --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/TreeDumpContentHandler.java @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2007 Henri Sivonen + * Copyright (c) 2008 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/**
 * SAX {@link ContentHandler} / {@link LexicalHandler} that writes the events
 * it receives to a {@link Writer} as a line-oriented tree dump.
 *
 * <p>Every node is written on its own line that starts with {@code "| "}
 * followed by one indent unit per nesting level. Elements are written as
 * {@code <name>}, attributes as {@code name="value"} (sorted by name), text as
 * a double-quoted run, comments as {@code <!-- text -->} and the doctype as
 * {@code <!DOCTYPE name ...>}. Elements and attributes in well-known
 * namespaces get a short prefix ({@code math}, {@code svg}, {@code xlink},
 * {@code xml}, {@code xmlns}); anything else that is namespaced is labelled
 * {@code otherns}.
 *
 * <p>Namespace URIs and local names are compared with {@code equals}, so the
 * handler does not depend on the caller passing interned strings.
 */
public class TreeDumpContentHandler implements ContentHandler, LexicalHandler {

    private static final String XHTML_NS = "http://www.w3.org/1999/xhtml";

    private static final String MATHML_NS = "http://www.w3.org/1998/Math/MathML";

    private static final String SVG_NS = "http://www.w3.org/2000/svg";

    private static final String XLINK_NS = "http://www.w3.org/1999/xlink";

    private static final String XML_NS = "http://www.w3.org/XML/1998/namespace";

    private static final String XMLNS_NS = "http://www.w3.org/2000/xmlns/";

    private final Writer writer;

    /** Whether {@link #endDocument()} flushes and closes the writer. */
    private final boolean close;

    /** Current nesting depth; one indent unit is written per level. */
    private int level = 0;

    /** True while a quoted text run is open and still needs its closing quote. */
    private boolean inCharacters = false;

    /**
     * @param writer destination of the dump
     * @param close  if true, the writer is flushed and closed on
     *               {@link #endDocument()}
     */
    public TreeDumpContentHandler(final Writer writer, boolean close) {
        this.writer = writer;
        this.close = close;
    }

    /**
     * Creates a handler that closes the writer at the end of the document.
     *
     * @param writer destination of the dump
     */
    public TreeDumpContentHandler(final Writer writer) {
        this(writer, true);
    }

    /** Terminates an open text run, if any, with its closing quote and newline. */
    private void closeCharacters() throws IOException {
        if (inCharacters) {
            writer.write("\"\n");
            inCharacters = false;
        }
    }

    /** Closes any open text run and writes the line prefix for the current level. */
    private void printLead() throws IOException {
        closeCharacters();
        writer.write("| ");
        for (int i = 0; i < level; i++) {
            // NOTE(review): the indent unit is reproduced exactly as it appears
            // in the available source text; confirm it was not shortened by
            // whitespace collapsing before relying on the exact width.
            writer.write(" ");
        }
    }

    /** Returns true for the XHTML {@code template} element. */
    private static boolean isTemplate(String uri, String localName) {
        return XHTML_NS.equals(uri) && "template".equals(localName);
    }

    /**
     * Builds the display name of an attribute from its own namespace.
     *
     * @param ns        namespace URI of the attribute (null is treated as none)
     * @param localName local name of the attribute
     */
    private static String attributeName(String ns, String localName) {
        if (XLINK_NS.equals(ns)) {
            return "xlink " + localName;
        } else if (XML_NS.equals(ns)) {
            return "xml " + localName;
        } else if (XMLNS_NS.equals(ns)) {
            return "xmlns " + localName;
        } else if (ns == null || ns.length() == 0) {
            return localName;
        } else {
            return "otherns " + localName;
        }
    }

    /**
     * Appends text to the current quoted run, opening a new run (on a new
     * line) if none is open. Consecutive calls therefore coalesce into one run.
     */
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        try {
            if (!inCharacters) {
                printLead();
                writer.write('"');
                inCharacters = true;
            }
            writer.write(ch, start, length);
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /** Closes any open text run and steps the nesting level back out. */
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        try {
            closeCharacters();
            level--;
            if (isTemplate(uri, localName)) {
                // undo the extra level added for the "content" line
                level--;
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes the element line, then one line per attribute in sorted order.
     * For an XHTML {@code template} element an additional {@code content}
     * line is written and the children are nested one level deeper.
     */
    public void startElement(String uri, String localName, String qName,
            Attributes atts) throws SAXException {
        try {
            printLead();
            writer.write('<');
            if (MATHML_NS.equals(uri)) {
                writer.write("math ");
            } else if (SVG_NS.equals(uri)) {
                writer.write("svg ");
            } else if (!XHTML_NS.equals(uri)) {
                writer.write("otherns ");
            }
            writer.write(localName);
            writer.write(">\n");
            level++;
            // TreeMap keeps the attribute lines sorted by display name.
            Map<String, String> sorted = new TreeMap<String, String>();
            for (int i = 0; i < atts.getLength(); i++) {
                sorted.put(attributeName(atts.getURI(i), atts.getLocalName(i)),
                        atts.getValue(i));
            }
            for (Map.Entry<String, String> entry : sorted.entrySet()) {
                printLead();
                writer.write(entry.getKey());
                writer.write("=\"");
                writer.write(entry.getValue());
                writer.write("\"\n");
            }
            if (isTemplate(uri, localName)) {
                printLead();
                level++;
                writer.write("content\n");
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /** Writes a comment node line. */
    public void comment(char[] ch, int offset, int len) throws SAXException {
        try {
            printLead();
            writer.write("<!-- ");
            writer.write(ch, offset, len);
            writer.write(" -->\n");
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Writes the doctype line. The quoted public and system identifiers are
     * written only when at least one of them is non-empty; a null identifier
     * is treated as empty.
     */
    public void startDTD(String name, String publicIdentifier,
            String systemIdentifier) throws SAXException {
        String publicId = publicIdentifier == null ? "" : publicIdentifier;
        String systemId = systemIdentifier == null ? "" : systemIdentifier;
        try {
            printLead();
            writer.write("<!DOCTYPE ");
            writer.write(name);
            if (publicId.length() > 0 || systemId.length() > 0) {
                writer.write(' ');
                writer.write('\"');
                writer.write(publicId);
                writer.write('\"');
                writer.write(' ');
                writer.write('\"');
                writer.write(systemId);
                writer.write('\"');
            }
            writer.write(">\n");
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    /**
     * Closes any open text run and, if this handler was created with
     * {@code close == true}, flushes and closes the writer.
     */
    public void endDocument() throws SAXException {
        try {
            closeCharacters();
            if (close) {
                writer.flush();
                writer.close();
            }
        } catch (IOException e) {
            throw new SAXException(e);
        }
    }

    // The remaining callbacks are intentionally no-ops: they produce no output.

    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
    }

    public void startEntity(String arg0) throws SAXException {
    }

    public void endCDATA() throws SAXException {
    }

    public void endDTD() throws SAXException {
    }

    public void endEntity(String arg0) throws SAXException {
    }

    public void startCDATA() throws SAXException {
    }

    public void endPrefixMapping(String prefix) throws SAXException {
    }

    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
    }

    public void processingInstruction(String target, String data)
            throws SAXException {
    }

    public void setDocumentLocator(Locator locator) {
    }

    public void skippedEntity(String name) throws SAXException {
    }

    public void startDocument() throws SAXException {
    }

}
Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.htmlparser.test; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import nu.validator.htmlparser.common.XmlViolationPolicy; +import nu.validator.htmlparser.sax.HtmlParser; + +public class TreePrinter { + + public static void main(String[] args) throws SAXException, IOException { + TreeDumpContentHandler treeDumpContentHandler = new TreeDumpContentHandler(new OutputStreamWriter(System.out, "UTF-8")); + HtmlParser htmlParser = new HtmlParser(); + htmlParser.setContentHandler(treeDumpContentHandler); + htmlParser.setLexicalHandler(treeDumpContentHandler); + htmlParser.setErrorHandler(new SystemErrErrorHandler()); + htmlParser.setXmlPolicy(XmlViolationPolicy.ALLOW); + File file = new File(args[0]); + InputSource is = new InputSource(new FileInputStream(file)); + is.setSystemId(file.toURI().toASCIIString()); + htmlParser.parse(is); + } +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/TreeTester.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/TreeTester.java new file mode 100644 index 000000000..62d3ab530 --- 
/dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/TreeTester.java @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2007 Henri Sivonen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +package nu.validator.htmlparser.test; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.StringWriter; +import java.util.LinkedList; + +import nu.validator.htmlparser.common.XmlViolationPolicy; +import nu.validator.htmlparser.sax.HtmlParser; + +import org.xml.sax.InputSource; +import org.xml.sax.SAXParseException; + +public class TreeTester { + + private final BufferedInputStream aggregateStream; + + private boolean streaming = false; + + /** + * @param aggregateStream + */ + public TreeTester(InputStream aggregateStream) { + this.aggregateStream = new BufferedInputStream(aggregateStream); + } + + private void runTests() throws Throwable { + if (aggregateStream.read() != '#') { + System.err.println("No hash at start!"); + return; + } + while (runTest()) { + // spin + } + } + + private boolean runTest() throws Throwable { + UntilHashInputStream stream = null; + try { + String context = null; + boolean scriptingEnabled = true; + boolean hadScriptingDirective = false; + aggregateStream.mark(12288); + if (skipLabel()) { // #data + return false; + } + stream = new UntilHashInputStream(aggregateStream); + while (stream.read() != -1) { + // spin + } + if (skipLabel()) { // #errors + System.err.println("Premature end of test data."); + return false; + } + stream = new UntilHashInputStream(aggregateStream); + while (stream.read() != -1) { + // spin + } + + StringBuilder sb = new StringBuilder(); + int c; + while ((c = aggregateStream.read()) != '\n') { + sb.append((char) c); + } + String label = sb.toString(); + if ("document-fragment".equals(label)) { + sb.setLength(0); + while ((c = aggregateStream.read()) != '\n') { + sb.append((char) c); + } + context = sb.toString(); + // Now potentially gather #script-on/off + sb.setLength(0); + while ((c = aggregateStream.read()) != '\n') { + sb.append((char) c); + } 
+ label = sb.toString(); + } + if ("script-on".equals(label)) { + hadScriptingDirective = true; + } else if ("script-off".equals(label)) { + hadScriptingDirective = true; + scriptingEnabled = false; + } + aggregateStream.reset(); + if (skipLabel()) { // #data + System.err.println("Premature end of test data."); + return false; + } + stream = new UntilHashInputStream(aggregateStream); + InputSource is = new InputSource(stream); + is.setEncoding("UTF-8"); + StringWriter sw = new StringWriter(); + ListErrorHandler leh = new ListErrorHandler(); + TreeDumpContentHandler treeDumpContentHandler = new TreeDumpContentHandler( + sw); + HtmlParser htmlParser = new HtmlParser(XmlViolationPolicy.ALLOW); + if (streaming) { + htmlParser.setStreamabilityViolationPolicy(XmlViolationPolicy.FATAL); + } + htmlParser.setContentHandler(treeDumpContentHandler); + htmlParser.setLexicalHandler(treeDumpContentHandler); + htmlParser.setErrorHandler(leh); + htmlParser.setScriptingEnabled(scriptingEnabled); + try { + if (context == null) { + htmlParser.parse(is); + } else { + String ns = "http://www.w3.org/1999/xhtml"; + if (context.startsWith("svg ")) { + ns = "http://www.w3.org/2000/svg"; + context = context.substring(4); + } else if (context.startsWith("math ")) { + ns = "http://www.w3.org/1998/Math/MathML"; + context = context.substring(5); + } + htmlParser.parseFragment(is, context, ns); + treeDumpContentHandler.endDocument(); + } + } catch (SAXParseException e) { + // ignore + } + stream.close(); + + if (skipLabel()) { // #errors + System.err.println("Premature end of test data."); + return false; + } + LinkedList<String> expectedErrors = new LinkedList<String>(); + BufferedReader br = new BufferedReader(new InputStreamReader( + new UntilHashInputStream(aggregateStream), "UTF-8")); + String line = null; + while ((line = br.readLine()) != null) { + expectedErrors.add(line); + } + + if (context != null) { + if (skipLabel()) { // #document-fragment + System.err.println("Premature end of 
test data."); + return false; + } + UntilHashInputStream stream2 = new UntilHashInputStream(aggregateStream); + while (stream2.read() != -1) { + // spin + } + } + if (hadScriptingDirective && skipLabel()) { // #script-on/off + System.err.println("Premature end of test data."); + return false; + } + + if (skipLabel()) { // #document + System.err.println("Premature end of test data."); + return false; + } + + StringBuilder expectedBuilder = new StringBuilder(); + br = new BufferedReader(new InputStreamReader( + new UntilHashInputStream(aggregateStream), "UTF-8")); + int ch; + while ((ch = br.read()) != -1) { + expectedBuilder.append((char)ch); + } + String expected = expectedBuilder.toString(); + String actual = sw.toString(); + + LinkedList<String> actualErrors = leh.getErrors(); + + if (expected.equals(actual) || (streaming && leh.isFatal()) /* + * && expectedErrors.size() == + * actualErrors.size() + */) { + System.err.println("Success."); + // System.err.println(stream); + } else { + System.err.print("Failure.\nData:\n" + stream + "\nExpected:\n" + + expected + "Got: \n" + actual); + System.err.println("Expected errors:"); + for (String err : expectedErrors) { + System.err.println(err); + } + System.err.println("Actual errors:"); + for (String err : actualErrors) { + System.err.println(err); + } + } + } catch (Throwable t) { + System.err.println("Failure.\nData:\n" + stream); + throw t; + } + return true; + } + + private boolean skipLabel() throws IOException { + int b = aggregateStream.read(); + if (b == -1) { + return true; + } + for (;;) { + b = aggregateStream.read(); + if (b == -1) { + return true; + } else if (b == 0x0A) { + return false; + } + } + } + + /** + * @param args + * @throws Throwable + */ + public static void main(String[] args) throws Throwable { + for (int i = 0; i < args.length; i++) { + TreeTester tester = new TreeTester(new FileInputStream(args[i])); + tester.runTests(); + } + } + +} diff --git 
/**
 * Input stream view over one section of a {@code #}-delimited test file.
 *
 * <p>The view ends when a {@code #} directly follows a line feed in the
 * delegate (that final line feed is not returned), when the very first byte of
 * the delegate is {@code #}, or when the delegate itself ends. The terminating
 * {@code #} is consumed from the delegate.
 *
 * <p>Every byte returned is also recorded in a transcript available through
 * {@link #toString()}: bytes in the range 0x20..0x7F verbatim, all others as
 * {@code 0x} followed by their hexadecimal value.
 */
public class UntilHashInputStream extends InputStream {

    /** Transcript of the bytes returned so far, for diagnostics. */
    private final StringBuilder builder = new StringBuilder();

    private final InputStream delegate;

    /** One byte of look-ahead, needed to spot the line feed + '#' terminator. */
    private int buffer = -1;

    private boolean closed = false;

    /**
     * @param delegate stream positioned at the start of a section
     * @throws IOException if reading the first byte fails
     */
    public UntilHashInputStream(final InputStream delegate) throws IOException {
        this.delegate = delegate;
        this.buffer = delegate.read();
        if (buffer == '#') {
            // empty section: the next label starts immediately
            closed = true;
        }
    }

    /**
     * @return the next byte of the section, or -1 once the section (or the
     *         delegate) has ended
     */
    @Override
    public int read() throws IOException {
        if (closed) {
            return -1;
        }
        int rv = buffer;
        if (rv == -1) {
            // The delegate is exhausted. Do not record the end-of-stream
            // sentinel in the transcript and do not poll the delegate again.
            closed = true;
            return -1;
        }
        buffer = delegate.read();
        if (buffer == '#' && rv == '\n') {
            // end of section; the trailing line feed is not part of the data
            closed = true;
            return -1;
        }
        if (rv >= 0x20 && rv < 0x80) {
            builder.append((char) rv);
        } else {
            builder.append("0x");
            builder.append(Integer.toHexString(rv));
        }
        return rv;
    }

    /**
     * Ends the section early: unless it has already ended, bytes are read from
     * the delegate and discarded up to and including the next {@code #} (or
     * the end of the delegate). The delegate itself is not closed.
     *
     * @see java.io.InputStream#close()
     */
    @Override
    public void close() throws IOException {
        super.close();
        if (closed) {
            return;
        }
        for (;;) {
            int b = delegate.read();
            if (b == 0x23 || b == -1) {
                break;
            }
        }
        closed = true;
    }

    /**
     * @return the transcript of the bytes returned by {@link #read()} so far
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        return builder.toString();
    }

}
limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.htmlparser.test; + +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +import nu.validator.htmlparser.sax.XmlSerializer; + +public class XmlSerializerTester { + + + + /** + * @param args + * @throws SAXException + */ + public static void main(String[] args) throws SAXException { + AttributesImpl attrs = new AttributesImpl(); + XmlSerializer serializer = new XmlSerializer(System.out); + serializer.startDocument(); + serializer.startElement("1", "a", null, attrs); + serializer.startElement("1", "b", null, attrs); + serializer.endElement("1", "b", null); + serializer.startElement("2", "c", null, attrs); + serializer.endElement("2", "c", null); + attrs.addAttribute("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "about", null, "CDATA", ""); + serializer.startElement("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "d", null, attrs); + serializer.endElement("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "d", null); + serializer.startPrefixMapping("rdf", "foo"); + serializer.startElement("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "e", null, attrs); + 
serializer.startPrefixMapping("p0", "bar"); + serializer.startElement("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "f", null, attrs); + serializer.characters("a\uD834\uDD21a\uD834a\uDD21a".toCharArray(), 0, 8); + serializer.endElement("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "f", null); + serializer.endElement("http://www.w3.org/1999/02/22-rdf-syntax-ns#", "e", null); + + serializer.endPrefixMapping("rdf"); + serializer.endElement("1", "a", null); + serializer.endDocument(); + } + +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/XomTest.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/XomTest.java new file mode 100644 index 000000000..66d706ae9 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/XomTest.java @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2009 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +package nu.validator.htmlparser.test; + +import nu.xom.Attribute; +import nu.xom.Element; + +public class XomTest { + public static void main(String[] args) { + Element elt = new Element("html", "http://www.w3.org/1999/xhtml"); + elt.addAttribute(new Attribute("xmlns:foo", "bar")); + } +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/package.html b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/package.html new file mode 100644 index 000000000..57809b84e --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/test/package.html @@ -0,0 +1,29 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"> +<html> +<head><title>Package Overview</title> +<!-- + Copyright (c) 2007 Henri Sivonen + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. +--> +</head> +<body bgcolor="white"> +<p>Test drivers.</p> +</body> +</html>
\ No newline at end of file diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/HTML2HTML.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/HTML2HTML.java new file mode 100644 index 000000000..5e2cf1f58 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/HTML2HTML.java @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2008 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +package nu.validator.htmlparser.tools; + +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.MalformedURLException; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.TransformerException; + +import nu.validator.htmlparser.common.XmlViolationPolicy; +import nu.validator.htmlparser.sax.HtmlParser; +import nu.validator.htmlparser.sax.HtmlSerializer; +import nu.validator.htmlparser.sax.XmlSerializer; +import nu.validator.htmlparser.test.SystemErrErrorHandler; + +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +public class HTML2HTML { + + /** + * @param args + */ + public static void main(String[] args) throws SAXException, + ParserConfigurationException, MalformedURLException, IOException, + TransformerException { + InputStream in; + OutputStream out; + + switch (args.length) { + case 0: + in = System.in; + out = System.out; + break; + case 1: + in = new FileInputStream(args[0]); + out = System.out; + break; + case 2: + in = new FileInputStream(args[0]); + out = new FileOutputStream(args[1]); + break; + default: + System.err.println("Too many arguments. No arguments to use stdin/stdout. One argument to reading from file and write to stdout. 
Two arguments to read from first file and write to second."); + System.exit(1); + return; + } + + ContentHandler serializer = new HtmlSerializer(out); + + HtmlParser parser = new HtmlParser(XmlViolationPolicy.ALLOW); + + parser.setErrorHandler(new SystemErrErrorHandler()); + parser.setContentHandler(serializer); + parser.setProperty("http://xml.org/sax/properties/lexical-handler", + serializer); + parser.parse(new InputSource(in)); + out.flush(); + out.close(); + } +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/HTML2XML.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/HTML2XML.java new file mode 100644 index 000000000..57666f93b --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/HTML2XML.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2008 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +package nu.validator.htmlparser.tools; + +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.MalformedURLException; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.TransformerException; + +import nu.validator.htmlparser.common.XmlViolationPolicy; +import nu.validator.htmlparser.sax.HtmlParser; +import nu.validator.htmlparser.sax.XmlSerializer; +import nu.validator.htmlparser.test.SystemErrErrorHandler; + +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +public class HTML2XML { + + /** + * @param args + */ + public static void main(String[] args) throws SAXException, + ParserConfigurationException, MalformedURLException, IOException, + TransformerException { + InputStream in; + OutputStream out; + + switch (args.length) { + case 0: + in = System.in; + out = System.out; + break; + case 1: + in = new FileInputStream(args[0]); + out = System.out; + break; + case 2: + in = new FileInputStream(args[0]); + out = new FileOutputStream(args[1]); + break; + default: + System.err.println("Too many arguments. No arguments to use stdin/stdout. One argument to reading from file and write to stdout. 
Two arguments to read from first file and write to second."); + System.exit(1); + return; + } + + ContentHandler serializer = new XmlSerializer(out); + + HtmlParser parser = new HtmlParser(XmlViolationPolicy.ALTER_INFOSET); + + parser.setErrorHandler(new SystemErrErrorHandler()); + parser.setContentHandler(serializer); + parser.setProperty("http://xml.org/sax/properties/lexical-handler", + serializer); + parser.parse(new InputSource(in)); + out.flush(); + out.close(); + } +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XML2HTML.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XML2HTML.java new file mode 100644 index 000000000..dad89a5b2 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XML2HTML.java @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2008 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +package nu.validator.htmlparser.tools; + +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.MalformedURLException; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParserFactory; +import javax.xml.transform.TransformerException; + +import nu.validator.htmlparser.sax.HtmlSerializer; +import nu.validator.htmlparser.sax.XmlSerializer; +import nu.validator.htmlparser.test.SystemErrErrorHandler; + +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +public class XML2HTML { + + /** + * @param args + */ + public static void main(String[] args) throws SAXException, + ParserConfigurationException, MalformedURLException, IOException, + TransformerException { + InputStream in; + OutputStream out; + + switch (args.length) { + case 0: + in = System.in; + out = System.out; + break; + case 1: + in = new FileInputStream(args[0]); + out = System.out; + break; + case 2: + in = new FileInputStream(args[0]); + out = new FileOutputStream(args[1]); + break; + default: + System.err.println("Too many arguments. No arguments to use stdin/stdout. One argument to reading from file and write to stdout. 
Two arguments to read from first file and write to second."); + System.exit(1); + return; + } + + ContentHandler serializer = new HtmlSerializer(out); + + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + factory.setValidating(false); + XMLReader parser = factory.newSAXParser().getXMLReader(); + parser.setErrorHandler(new SystemErrErrorHandler()); + parser.setContentHandler(serializer); + parser.setProperty("http://xml.org/sax/properties/lexical-handler", + serializer); + parser.parse(new InputSource(in)); + out.flush(); + out.close(); + } +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XML2XML.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XML2XML.java new file mode 100644 index 000000000..2f6aa24d8 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XML2XML.java @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2008 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.htmlparser.tools; + +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.net.MalformedURLException; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParserFactory; +import javax.xml.transform.TransformerException; + +import nu.validator.htmlparser.sax.NameCheckingXmlSerializer; +import nu.validator.htmlparser.sax.XmlSerializer; +import nu.validator.htmlparser.test.SystemErrErrorHandler; + +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +public class XML2XML { + + /** + * @param args + */ + public static void main(String[] args) throws SAXException, + ParserConfigurationException, MalformedURLException, IOException, + TransformerException { + InputStream in; + OutputStream out; + + switch (args.length) { + case 0: + in = System.in; + out = System.out; + break; + case 1: + in = new FileInputStream(args[0]); + out = System.out; + break; + case 2: + in = new FileInputStream(args[0]); + out = new FileOutputStream(args[1]); + break; + default: + System.err.println("Too many arguments. No arguments to use stdin/stdout. One argument to reading from file and write to stdout. 
Two arguments to read from first file and write to second."); + System.exit(1); + return; + } + + ContentHandler serializer = new NameCheckingXmlSerializer(out); + + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + factory.setValidating(false); + XMLReader parser = factory.newSAXParser().getXMLReader(); + parser.setErrorHandler(new SystemErrErrorHandler()); + parser.setContentHandler(serializer); + parser.setProperty("http://xml.org/sax/properties/lexical-handler", + serializer); + parser.parse(new InputSource(in)); + out.flush(); + out.close(); + } +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XSLT4HTML5.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XSLT4HTML5.java new file mode 100644 index 000000000..05d8193c1 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XSLT4HTML5.java @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2007 Henri Sivonen + * Copyright (c) 2007 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.htmlparser.tools; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.net.MalformedURLException; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParserFactory; +import javax.xml.transform.Templates; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.sax.SAXResult; +import javax.xml.transform.sax.SAXTransformerFactory; +import javax.xml.transform.sax.TemplatesHandler; +import javax.xml.transform.sax.TransformerHandler; + +import nu.validator.htmlparser.common.XmlViolationPolicy; +import nu.validator.htmlparser.dom.HtmlDocumentBuilder; +import nu.validator.htmlparser.sax.HtmlParser; +import nu.validator.htmlparser.sax.HtmlSerializer; +import nu.validator.htmlparser.sax.XmlSerializer; +import nu.validator.htmlparser.test.SystemErrErrorHandler; + +import org.w3c.dom.Document; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; +import org.xml.sax.ext.LexicalHandler; + +public class XSLT4HTML5 { + + private enum Mode { + STREAMING_SAX, BUFFERED_SAX, DOM, + } + + private static final String TEMPLATE = "--template="; + + private static final String INPUT_HTML = "--input-html="; + + private static final String INPUT_XML = "--input-xml="; + + private static final String OUTPUT_HTML = "--output-html="; + + private static final String OUTPUT_XML = "--output-xml="; + + private static final String 
MODE = "--mode="; + + /** + * @param args + * @throws ParserConfigurationException + * @throws SAXException + * @throws IOException + * @throws MalformedURLException + * @throws TransformerException + */ + public static void main(String[] args) throws SAXException, + ParserConfigurationException, MalformedURLException, IOException, TransformerException { + if (args.length == 0) { + System.out.println("--template=file --input-[html|xml]=file --output-[html|xml]=file --mode=[sax-streaming|sax-buffered|dom]"); + System.exit(0); + } + String template = null; + String input = null; + boolean inputHtml = false; + String output = null; + boolean outputHtml = false; + Mode mode = null; + for (int i = 0; i < args.length; i++) { + String arg = args[i]; + if (arg.startsWith(TEMPLATE)) { + if (template == null) { + template = arg.substring(TEMPLATE.length()); + } else { + System.err.println("Tried to set template twice."); + System.exit(1); + } + } else if (arg.startsWith(INPUT_HTML)) { + if (input == null) { + input = arg.substring(INPUT_HTML.length()); + inputHtml = true; + } else { + System.err.println("Tried to set input twice."); + System.exit(2); + } + } else if (arg.startsWith(INPUT_XML)) { + if (input == null) { + input = arg.substring(INPUT_XML.length()); + inputHtml = false; + } else { + System.err.println("Tried to set input twice."); + System.exit(2); + } + } else if (arg.startsWith(OUTPUT_HTML)) { + if (output == null) { + output = arg.substring(OUTPUT_HTML.length()); + outputHtml = true; + } else { + System.err.println("Tried to set output twice."); + System.exit(3); + } + } else if (arg.startsWith(OUTPUT_XML)) { + if (output == null) { + output = arg.substring(OUTPUT_XML.length()); + outputHtml = false; + } else { + System.err.println("Tried to set output twice."); + System.exit(3); + } + } else if (arg.startsWith(MODE)) { + if (mode == null) { + String modeStr = arg.substring(MODE.length()); + if ("dom".equals(modeStr)) { + mode = Mode.DOM; + } else if 
("sax-buffered".equals(modeStr)) { + mode = Mode.BUFFERED_SAX; + } else if ("sax-streaming".equals(modeStr)) { + mode = Mode.STREAMING_SAX; + } else { + System.err.println("Unrecognized mode."); + System.exit(5); + } + } else { + System.err.println("Tried to set mode twice."); + System.exit(4); + } + } + } + + if (template == null) { + System.err.println("No template specified."); + System.exit(6); + } + if (input == null) { + System.err.println("No input specified."); + System.exit(7); + } + if (output == null) { + System.err.println("No output specified."); + System.exit(8); + } + if (mode == null) { + mode = Mode.BUFFERED_SAX; + } + + SystemErrErrorHandler errorHandler = new SystemErrErrorHandler(); + + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + factory.setValidating(false); + XMLReader reader = factory.newSAXParser().getXMLReader(); + reader.setErrorHandler(errorHandler); + + SAXTransformerFactory transformerFactory = (SAXTransformerFactory) TransformerFactory.newInstance(); + transformerFactory.setErrorListener(errorHandler); + TemplatesHandler templatesHandler = transformerFactory.newTemplatesHandler(); + reader.setContentHandler(templatesHandler); + reader.parse(new File(template).toURI().toASCIIString()); + + Templates templates = templatesHandler.getTemplates(); + + FileOutputStream outputStream = new FileOutputStream(output); + ContentHandler serializer; + if (outputHtml) { + serializer = new HtmlSerializer(outputStream); + } else { + serializer = new XmlSerializer(outputStream); + } + SAXResult result = new SAXResult(new XmlnsDropper(serializer)); + result.setLexicalHandler((LexicalHandler) serializer); + + if (mode == Mode.DOM) { + Document inputDoc; + DocumentBuilder builder; + if (inputHtml) { + builder = new HtmlDocumentBuilder(XmlViolationPolicy.ALTER_INFOSET); + } else { + DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(true); + try { + 
builder = builderFactory.newDocumentBuilder(); + } catch (ParserConfigurationException e) { + throw new RuntimeException(e); + } + } + inputDoc = builder.parse(new File(input)); + DOMSource inputSource = new DOMSource(inputDoc, + new File(input).toURI().toASCIIString()); + Transformer transformer = templates.newTransformer(); + transformer.setErrorListener(errorHandler); + transformer.transform(inputSource, result); + } else { + if (inputHtml) { + reader = new HtmlParser(XmlViolationPolicy.ALTER_INFOSET); + if (mode == Mode.STREAMING_SAX) { + reader.setProperty("http://validator.nu/properties/streamability-violation-policy", XmlViolationPolicy.FATAL); + } + } + TransformerHandler transformerHandler = transformerFactory.newTransformerHandler(templates); + transformerHandler.setResult(result); + reader.setErrorHandler(errorHandler); + reader.setContentHandler(transformerHandler); + reader.setProperty("http://xml.org/sax/properties/lexical-handler", transformerHandler); + reader.parse(new File(input).toURI().toASCIIString()); + } + outputStream.flush(); + outputStream.close(); + } + +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XSLT4HTML5XOM.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XSLT4HTML5XOM.java new file mode 100644 index 000000000..b364cc521 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XSLT4HTML5XOM.java @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2007 Henri Sivonen + * Copyright (c) 2007 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following 
conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.htmlparser.tools; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; + +import nu.validator.htmlparser.common.XmlViolationPolicy; +import nu.validator.htmlparser.sax.HtmlSerializer; +import nu.validator.htmlparser.xom.HtmlBuilder; +import nu.xom.Builder; +import nu.xom.Document; +import nu.xom.Element; +import nu.xom.Nodes; +import nu.xom.ParsingException; +import nu.xom.Serializer; +import nu.xom.ValidityException; +import nu.xom.converters.SAXConverter; +import nu.xom.xslt.XSLException; +import nu.xom.xslt.XSLTransform; + +import org.xml.sax.SAXException; + +public class XSLT4HTML5XOM { + + private static final String TEMPLATE = "--template="; + + private static final String INPUT_HTML = "--input-html="; + + private static final String INPUT_XML = "--input-xml="; + + private static final String OUTPUT_HTML = "--output-html="; + + private static final String OUTPUT_XML = "--output-xml="; + + /** + * @param args + * @throws IOException + * @throws ParsingException + * @throws ValidityException + * @throws XSLException + * @throws SAXException + */ + public static void main(String[] args) throws ValidityException, + ParsingException, IOException, XSLException, SAXException { + if (args.length == 0) { + System.out.println("--template=file --input-[html|xml]=file 
--output-[html|xml]=file --mode=[sax-streaming|sax-buffered|dom]"); + System.exit(0); + } + String template = null; + String input = null; + boolean inputHtml = false; + String output = null; + boolean outputHtml = false; + for (int i = 0; i < args.length; i++) { + String arg = args[i]; + if (arg.startsWith(TEMPLATE)) { + if (template == null) { + template = arg.substring(TEMPLATE.length()); + } else { + System.err.println("Tried to set template twice."); + System.exit(1); + } + } else if (arg.startsWith(INPUT_HTML)) { + if (input == null) { + input = arg.substring(INPUT_HTML.length()); + inputHtml = true; + } else { + System.err.println("Tried to set input twice."); + System.exit(2); + } + } else if (arg.startsWith(INPUT_XML)) { + if (input == null) { + input = arg.substring(INPUT_XML.length()); + inputHtml = false; + } else { + System.err.println("Tried to set input twice."); + System.exit(2); + } + } else if (arg.startsWith(OUTPUT_HTML)) { + if (output == null) { + output = arg.substring(OUTPUT_HTML.length()); + outputHtml = true; + } else { + System.err.println("Tried to set output twice."); + System.exit(3); + } + } else if (arg.startsWith(OUTPUT_XML)) { + if (output == null) { + output = arg.substring(OUTPUT_XML.length()); + outputHtml = false; + } else { + System.err.println("Tried to set output twice."); + System.exit(3); + } + } + } + + if (template == null) { + System.err.println("No template specified."); + System.exit(6); + } + if (input == null) { + System.err.println("No input specified."); + System.exit(7); + } + if (output == null) { + System.err.println("No output specified."); + System.exit(8); + } + + Builder builder = new Builder(); + + Document transformationDoc = builder.build(new File(template)); + + XSLTransform transform = new XSLTransform(transformationDoc); + + FileOutputStream outputStream = new FileOutputStream(output); + + Document inputDoc; + if (inputHtml) { + builder = new HtmlBuilder(XmlViolationPolicy.ALTER_INFOSET); + } + 
inputDoc = builder.build(new File(input)); + Nodes result = transform.transform(inputDoc); + Document outputDoc = new Document((Element) result.get(0)); + if (outputHtml) { + HtmlSerializer htmlSerializer = new HtmlSerializer(outputStream); + SAXConverter converter = new SAXConverter(htmlSerializer); + converter.setLexicalHandler(htmlSerializer); + converter.convert(outputDoc); + } else { + Serializer serializer = new Serializer(outputStream); + serializer.write(outputDoc); + } + outputStream.flush(); + outputStream.close(); + } + +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XmlnsDropper.java b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XmlnsDropper.java new file mode 100644 index 000000000..0e6d4b1c2 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/XmlnsDropper.java @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2007 Henri Sivonen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.htmlparser.tools; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +/** + * Quick and dirty hack to work around Xalan xmlns weirdness. + * + * @version $Id$ + * @author hsivonen + */ +class XmlnsDropper implements ContentHandler { + + private final ContentHandler delegate; + + /** + * @param delegate + */ + public XmlnsDropper(final ContentHandler delegate) { + this.delegate = delegate; + } + + /** + * @param ch + * @param start + * @param length + * @throws SAXException + * @see org.xml.sax.ContentHandler#characters(char[], int, int) + */ + public void characters(char[] ch, int start, int length) throws SAXException { + delegate.characters(ch, start, length); + } + + /** + * @throws SAXException + * @see org.xml.sax.ContentHandler#endDocument() + */ + public void endDocument() throws SAXException { + delegate.endDocument(); + } + + /** + * @param uri + * @param localName + * @param qName + * @throws SAXException + * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String) + */ + public void endElement(String uri, String localName, String qName) throws SAXException { + delegate.endElement(uri, localName, qName); + } + + /** + * @param prefix + * @throws SAXException + * @see org.xml.sax.ContentHandler#endPrefixMapping(java.lang.String) + */ + public void endPrefixMapping(String prefix) throws SAXException { + delegate.endPrefixMapping(prefix); + } + + /** + * @param ch + * @param start + * @param length + * @throws SAXException + * @see org.xml.sax.ContentHandler#ignorableWhitespace(char[], int, int) 
+ */ + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + delegate.ignorableWhitespace(ch, start, length); + } + + /** + * @param target + * @param data + * @throws SAXException + * @see org.xml.sax.ContentHandler#processingInstruction(java.lang.String, java.lang.String) + */ + public void processingInstruction(String target, String data) throws SAXException { + delegate.processingInstruction(target, data); + } + + /** + * @param locator + * @see org.xml.sax.ContentHandler#setDocumentLocator(org.xml.sax.Locator) + */ + public void setDocumentLocator(Locator locator) { + delegate.setDocumentLocator(locator); + } + + /** + * @param name + * @throws SAXException + * @see org.xml.sax.ContentHandler#skippedEntity(java.lang.String) + */ + public void skippedEntity(String name) throws SAXException { + delegate.skippedEntity(name); + } + + /** + * @throws SAXException + * @see org.xml.sax.ContentHandler#startDocument() + */ + public void startDocument() throws SAXException { + delegate.startDocument(); + } + + /** + * @param uri + * @param localName + * @param qName + * @param atts + * @throws SAXException + * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes) + */ + public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { + AttributesImpl ai = new AttributesImpl(); + for (int i = 0; i < atts.getLength(); i++) { + String u = atts.getURI(i); + String t = atts.getType(i); + String v = atts.getValue(i); + String n = atts.getLocalName(i); + String q = atts.getQName(i); + if (q != null) { + if ("xmlns".equals(q) || q.startsWith("xmlns:")) { + continue; + } + } + ai.addAttribute(u, n, q, t, v); + } + delegate.startElement(uri, localName, qName, ai); + } + + /** + * @param prefix + * @param uri + * @throws SAXException + * @see org.xml.sax.ContentHandler#startPrefixMapping(java.lang.String, java.lang.String) + */ + 
public void startPrefixMapping(String prefix, String uri) throws SAXException { + delegate.startPrefixMapping(prefix, uri); + } + +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/package.html b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/package.html new file mode 100644 index 000000000..a04bf3cd0 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/htmlparser/tools/package.html @@ -0,0 +1,29 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"> +<html> +<head><title>Package Overview</title> +<!-- + Copyright (c) 2007 Henri Sivonen + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. +--> +</head> +<body bgcolor="white"> +<p>Demo apps.</p> +</body> +</html>
\ No newline at end of file diff --git a/parser/html/java/htmlparser/test-src/nu/validator/saxtree/test/PassThruPrinter.java b/parser/html/java/htmlparser/test-src/nu/validator/saxtree/test/PassThruPrinter.java new file mode 100644 index 000000000..df391d4b4 --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/saxtree/test/PassThruPrinter.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2007 Henri Sivonen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +package nu.validator.saxtree.test; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParserFactory; + +import nu.validator.htmlparser.sax.XmlSerializer; +import nu.validator.saxtree.Node; +import nu.validator.saxtree.TreeBuilder; +import nu.validator.saxtree.TreeParser; + +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; +import org.xml.sax.ext.LexicalHandler; + +public class PassThruPrinter { + public static void main(String[] args) throws SAXException, IOException, ParserConfigurationException { + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + factory.setValidating(false); + XMLReader reader = factory.newSAXParser().getXMLReader(); + + TreeBuilder treeBuilder = new TreeBuilder(); + reader.setContentHandler(treeBuilder); + reader.setProperty("http://xml.org/sax/properties/lexical-handler", treeBuilder); + + File file = new File(args[0]); + InputSource is = new InputSource(new FileInputStream(file)); + is.setSystemId(file.toURI().toASCIIString()); + reader.parse(is); + + Node doc = treeBuilder.getRoot(); + + ContentHandler xmlSerializer = new XmlSerializer(System.out); + + TreeParser treeParser = new TreeParser(xmlSerializer, (LexicalHandler) xmlSerializer); + treeParser.parse(doc); + } + +} diff --git a/parser/html/java/htmlparser/test-src/nu/validator/saxtree/test/package.html b/parser/html/java/htmlparser/test-src/nu/validator/saxtree/test/package.html new file mode 100644 index 000000000..57809b84e --- /dev/null +++ b/parser/html/java/htmlparser/test-src/nu/validator/saxtree/test/package.html @@ -0,0 +1,29 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"> +<html> +<head><title>Package Overview</title> +<!-- + Copyright (c) 2007 Henri Sivonen + + Permission is hereby granted, free of charge, 
to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. +--> +</head> +<body bgcolor="white"> +<p>Test drivers.</p> +</body> +</html>
\ No newline at end of file |