diff options
Diffstat (limited to 'parser/html/java/htmlparser/src/nu/validator/encoding/Big5Encoder.java')
-rw-r--r-- | parser/html/java/htmlparser/src/nu/validator/encoding/Big5Encoder.java | 185 |
1 files changed, 185 insertions, 0 deletions
diff --git a/parser/html/java/htmlparser/src/nu/validator/encoding/Big5Encoder.java b/parser/html/java/htmlparser/src/nu/validator/encoding/Big5Encoder.java new file mode 100644 index 000000000..de5132151 --- /dev/null +++ b/parser/html/java/htmlparser/src/nu/validator/encoding/Big5Encoder.java @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2015 Mozilla Foundation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +package nu.validator.encoding; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; +import java.nio.charset.CoderResult; + +public class Big5Encoder extends Encoder { + + private char utf16Lead = '\u0000'; + + private byte pendingTrail = 0; + + protected Big5Encoder(Charset cs) { + super(cs, 1.5f, 2.0f); + } + + @Override protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) { + assert !((this.reportMalformed || this.reportUnmappable) && (utf16Lead != '\u0000')): + "When reporting, this method should never return with utf16Lead set."; + if (pendingTrail != 0) { + if (!out.hasRemaining()) { + return CoderResult.OVERFLOW; + } + out.put(pendingTrail); + pendingTrail = 0; + } + for (;;) { + if (!in.hasRemaining()) { + return CoderResult.UNDERFLOW; + } + if (!out.hasRemaining()) { + return CoderResult.OVERFLOW; + } + boolean isAstral; // true means Plane 2, false means BMP + char lowBits; // The low 16 bits of the code point + char codeUnit = in.get(); + int highBits = (codeUnit & 0xFC00); + if (highBits == 0xD800) { + // high surrogate + if (utf16Lead != '\u0000') { + // High surrogate follows another high surrogate. The + // *previous* code unit is in error. + if (this.reportMalformed) { + // The caller had better adhere to the API contract. + // Otherwise, this may throw. + in.position(in.position() - 2); + utf16Lead = '\u0000'; + return CoderResult.malformedForLength(1); + } + out.put((byte) '?'); + } + utf16Lead = codeUnit; + continue; + } + if (highBits == 0xDC00) { + // low surrogate + if (utf16Lead == '\u0000') { + // Got low surrogate without a previous high surrogate + if (this.reportMalformed) { + in.position(in.position() - 1); + return CoderResult.malformedForLength(1); + } + out.put((byte) '?'); + continue; + } + int codePoint = (utf16Lead << 10) + codeUnit - 56613888; + utf16Lead = '\u0000'; + // Plane 2 is the only astral plane that has potentially + // Big5-encodable characters. + if ((0xFF0000 & codePoint) != 0x20000) { + if (this.reportUnmappable) { + in.position(in.position() - 2); + return CoderResult.unmappableForLength(2); + } + out.put((byte) '?'); + continue; + } + isAstral = true; + lowBits = (char)(codePoint & 0xFFFF); + } else { + // not a surrogate + if (utf16Lead != '\u0000') { + // Non-surrogate follows a high surrogate. The *previous* + // code unit is in error. + utf16Lead = '\u0000'; + if (this.reportMalformed) { + // The caller had better adhere to the API contract. + // Otherwise, this may throw. + in.position(in.position() - 2); + return CoderResult.malformedForLength(1); + } + out.put((byte) '?'); + // Let's unconsume this code unit and reloop in order to + // re-check if the output buffer still has space. + in.position(in.position() - 1); + continue; + } + isAstral = false; + lowBits = codeUnit; + } + // isAstral now tells us if we have a Plane 2 or a BMP character. + // lowBits tells us the low 16 bits. + // After all the above setup to deal with UTF-16, we are now + // finally ready to follow the spec. + if (!isAstral && lowBits <= 0x7F) { + out.put((byte)lowBits); + continue; + } + int pointer = Big5Data.findPointer(lowBits, isAstral); + if (pointer == 0) { + if (this.reportUnmappable) { + if (isAstral) { + in.position(in.position() - 2); + return CoderResult.unmappableForLength(2); + } + in.position(in.position() - 1); + return CoderResult.unmappableForLength(1); + } + out.put((byte)'?'); + continue; + } + int lead = pointer / 157 + 0x81; + int trail = pointer % 157; + if (trail < 0x3F) { + trail += 0x40; + } else { + trail += 0x62; + } + out.put((byte)lead); + if (!out.hasRemaining()) { + pendingTrail = (byte)trail; + return CoderResult.OVERFLOW; + } + out.put((byte)trail); + continue; + } + } + + @Override protected CoderResult implFlush(ByteBuffer out) { + if (pendingTrail != 0) { + if (!out.hasRemaining()) { + return CoderResult.OVERFLOW; + } + out.put(pendingTrail); + pendingTrail = 0; + } + if (utf16Lead != '\u0000') { + assert !this.reportMalformed: "How come utf16Lead got to be non-zero when decodeLoop() returned in the reporting mode?"; + if (!out.hasRemaining()) { + return CoderResult.OVERFLOW; + } + out.put((byte)'?'); + utf16Lead = '\u0000'; + } + return CoderResult.UNDERFLOW; + } + + @Override protected void implReset() { + utf16Lead = '\u0000'; + pendingTrail = 0; + } +} |