summaryrefslogtreecommitdiffstats
path: root/parser/html/java/htmlparser/src/nu/validator/encoding/Big5Encoder.java
diff options
context:
space:
mode:
Diffstat (limited to 'parser/html/java/htmlparser/src/nu/validator/encoding/Big5Encoder.java')
-rw-r--r--parser/html/java/htmlparser/src/nu/validator/encoding/Big5Encoder.java185
1 files changed, 185 insertions, 0 deletions
diff --git a/parser/html/java/htmlparser/src/nu/validator/encoding/Big5Encoder.java b/parser/html/java/htmlparser/src/nu/validator/encoding/Big5Encoder.java
new file mode 100644
index 000000000..de5132151
--- /dev/null
+++ b/parser/html/java/htmlparser/src/nu/validator/encoding/Big5Encoder.java
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2015 Mozilla Foundation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+package nu.validator.encoding;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CoderResult;
+
+public class Big5Encoder extends Encoder {
+
+ private char utf16Lead = '\u0000';
+
+ private byte pendingTrail = 0;
+
+ protected Big5Encoder(Charset cs) {
+ super(cs, 1.5f, 2.0f);
+ }
+
+ @Override protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
+ assert !((this.reportMalformed || this.reportUnmappable) && (utf16Lead != '\u0000')):
+ "When reporting, this method should never return with utf16Lead set.";
+ if (pendingTrail != 0) {
+ if (!out.hasRemaining()) {
+ return CoderResult.OVERFLOW;
+ }
+ out.put(pendingTrail);
+ pendingTrail = 0;
+ }
+ for (;;) {
+ if (!in.hasRemaining()) {
+ return CoderResult.UNDERFLOW;
+ }
+ if (!out.hasRemaining()) {
+ return CoderResult.OVERFLOW;
+ }
+ boolean isAstral; // true means Plane 2, false means BMP
+ char lowBits; // The low 16 bits of the code point
+ char codeUnit = in.get();
+ int highBits = (codeUnit & 0xFC00);
+ if (highBits == 0xD800) {
+ // high surrogate
+ if (utf16Lead != '\u0000') {
+ // High surrogate follows another high surrogate. The
+ // *previous* code unit is in error.
+ if (this.reportMalformed) {
+ // The caller had better adhere to the API contract.
+ // Otherwise, this may throw.
+ in.position(in.position() - 2);
+ utf16Lead = '\u0000';
+ return CoderResult.malformedForLength(1);
+ }
+ out.put((byte) '?');
+ }
+ utf16Lead = codeUnit;
+ continue;
+ }
+ if (highBits == 0xDC00) {
+ // low surrogate
+ if (utf16Lead == '\u0000') {
+ // Got low surrogate without a previous high surrogate
+ if (this.reportMalformed) {
+ in.position(in.position() - 1);
+ return CoderResult.malformedForLength(1);
+ }
+ out.put((byte) '?');
+ continue;
+ }
+ int codePoint = (utf16Lead << 10) + codeUnit - 56613888;
+ utf16Lead = '\u0000';
+ // Plane 2 is the only astral plane that has potentially
+ // Big5-encodable characters.
+ if ((0xFF0000 & codePoint) != 0x20000) {
+ if (this.reportUnmappable) {
+ in.position(in.position() - 2);
+ return CoderResult.unmappableForLength(2);
+ }
+ out.put((byte) '?');
+ continue;
+ }
+ isAstral = true;
+ lowBits = (char)(codePoint & 0xFFFF);
+ } else {
+ // not a surrogate
+ if (utf16Lead != '\u0000') {
+ // Non-surrogate follows a high surrogate. The *previous*
+ // code unit is in error.
+ utf16Lead = '\u0000';
+ if (this.reportMalformed) {
+ // The caller had better adhere to the API contract.
+ // Otherwise, this may throw.
+ in.position(in.position() - 2);
+ return CoderResult.malformedForLength(1);
+ }
+ out.put((byte) '?');
+ // Let's unconsume this code unit and reloop in order to
+ // re-check if the output buffer still has space.
+ in.position(in.position() - 1);
+ continue;
+ }
+ isAstral = false;
+ lowBits = codeUnit;
+ }
+ // isAstral now tells us if we have a Plane 2 or a BMP character.
+ // lowBits tells us the low 16 bits.
+ // After all the above setup to deal with UTF-16, we are now
+ // finally ready to follow the spec.
+ if (!isAstral && lowBits <= 0x7F) {
+ out.put((byte)lowBits);
+ continue;
+ }
+ int pointer = Big5Data.findPointer(lowBits, isAstral);
+ if (pointer == 0) {
+ if (this.reportUnmappable) {
+ if (isAstral) {
+ in.position(in.position() - 2);
+ return CoderResult.unmappableForLength(2);
+ }
+ in.position(in.position() - 1);
+ return CoderResult.unmappableForLength(1);
+ }
+ out.put((byte)'?');
+ continue;
+ }
+ int lead = pointer / 157 + 0x81;
+ int trail = pointer % 157;
+ if (trail < 0x3F) {
+ trail += 0x40;
+ } else {
+ trail += 0x62;
+ }
+ out.put((byte)lead);
+ if (!out.hasRemaining()) {
+ pendingTrail = (byte)trail;
+ return CoderResult.OVERFLOW;
+ }
+ out.put((byte)trail);
+ continue;
+ }
+ }
+
+ @Override protected CoderResult implFlush(ByteBuffer out) {
+ if (pendingTrail != 0) {
+ if (!out.hasRemaining()) {
+ return CoderResult.OVERFLOW;
+ }
+ out.put(pendingTrail);
+ pendingTrail = 0;
+ }
+ if (utf16Lead != '\u0000') {
+ assert !this.reportMalformed: "How come utf16Lead got to be non-zero when decodeLoop() returned in the reporting mode?";
+ if (!out.hasRemaining()) {
+ return CoderResult.OVERFLOW;
+ }
+ out.put((byte)'?');
+ utf16Lead = '\u0000';
+ }
+ return CoderResult.UNDERFLOW;
+ }
+
+ @Override protected void implReset() {
+ utf16Lead = '\u0000';
+ pendingTrail = 0;
+ }
+}