All Downloads are FREE. Search and download functionalities are using the official Maven repository.

nu.validator.encoding.Big5Decoder Maven / Gradle / Ivy

Go to download

The Validator.nu HTML Parser is an implementation of the HTML5 parsing algorithm in Java for applications. The parser is designed to work as a drop-in replacement for the XML parser in applications that already support XHTML 1.x content with an XML parser and use SAX, DOM or XOM to interface with the parser.

There is a newer version: 1.4.16
Show newest version
/*
 * Copyright (c) 2015 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a 
 * copy of this software and associated documentation files (the "Software"), 
 * to deal in the Software without restriction, including without limitation 
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
 * and/or sell copies of the Software, and to permit persons to whom the 
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in 
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.encoding;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CoderResult;

public class Big5Decoder extends Decoder {
   
    private int big5Lead = 0;
    
    private char pendingTrail = '\u0000';
    
    protected Big5Decoder(Charset cs) {
        super(cs, 0.5f, 1.0f);
    }

    @Override protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
        assert !(this.report && (big5Lead != 0)):
            "When reporting, this method should never return with big5Lead set.";
        if (pendingTrail != '\u0000') {
            if (!out.hasRemaining()) {
                return CoderResult.OVERFLOW;
            }
            out.put(pendingTrail);
            pendingTrail = '\u0000';
        }
        for (;;) {
            if (!in.hasRemaining()) {
                return CoderResult.UNDERFLOW;
            }
            if (!out.hasRemaining()) {
                return CoderResult.OVERFLOW;
            }
            int b = ((int) in.get() & 0xFF);
            if (big5Lead == 0) {
                if (b <= 0x7F) {
                    out.put((char) b);
                    continue;
                }
                if (b >= 0x81 && b <= 0xFE) {
                    if (this.report && !in.hasRemaining()) {
                        // The Java API is badly documented. Need to do this
                        // crazy thing and hope the caller knows about the
                        // undocumented aspects of the API!
                        in.position(in.position() - 1);
                        return CoderResult.UNDERFLOW;
                    }
                    big5Lead = b;
                    continue;
                }
                if (this.report) {
                    in.position(in.position() - 1);
                    return CoderResult.malformedForLength(1);
                }
                out.put('\uFFFD');
                continue;
            }
            int lead = big5Lead;
            big5Lead = 0;
            int offset = (b < 0x7F) ? 0x40 : 0x62;
            if ((b >= 0x40 && b <= 0x7E) || (b >= 0xA1 && b <= 0xFE)) {
                int pointer = (lead - 0x81) * 157 + (b - offset);
                char outTrail;
                switch (pointer) {
                    case 1133:
                        out.put('\u00CA');
                        outTrail = '\u0304';
                        break;
                    case 1135:
                        out.put('\u00CA');
                        outTrail = '\u030C';
                        break;
                    case 1164:
                        out.put('\u00EA');
                        outTrail = '\u0304';
                        break;
                    case 1166:
                        out.put('\u00EA');
                        outTrail = '\u030C';
                        break;
                    default:
                        char lowBits = Big5Data.lowBits(pointer);
                        if (lowBits == '\u0000') {
                            // The following |if| block fixes
                            // https://github.com/whatwg/encoding/issues/5
                            if (b <= 0x7F) {
                                // prepend byte to stream
                                // Always legal, since we've always just read a byte
                                // if we come here.
                                in.position(in.position() - 1);
                            }
                            if (this.report) {
                                // This can go past the start of the buffer
                                // if the caller does not conform to the
                                // undocumented aspects of the API.
                                in.position(in.position() - 1);
                                return CoderResult.malformedForLength(b <= 0x7F ? 1 : 2);
                            }
                            out.put('\uFFFD');
                            continue;
                        }
                        if (Big5Data.isAstral(pointer)) {
                            int codePoint = lowBits | 0x20000;
                            out.put((char) (0xD7C0 + (codePoint >> 10)));
                            outTrail = (char) (0xDC00 + (codePoint & 0x3FF));
                            break;
                        }
                        out.put(lowBits);
                        continue;
                }
                if (!out.hasRemaining()) {
                    pendingTrail = outTrail;
                    return CoderResult.OVERFLOW;
                }
                out.put(outTrail);
                continue;
            }
            // pointer is null
            if (b <= 0x7F) {
                // prepend byte to stream
                // Always legal, since we've always just read a byte
                // if we come here.
                in.position(in.position() - 1);
            }
            if (this.report) {
                // if position() == 0, the caller is not using the
                // undocumented part of the API right and the line
                // below will throw!
                in.position(in.position() - 1);
                return CoderResult.malformedForLength(b <= 0x7F ? 1 : 2);
            }
            out.put('\uFFFD');
            continue;
        }
    }

    @Override protected CoderResult implFlush(CharBuffer out) {
        if (pendingTrail != '\u0000') {
            if (!out.hasRemaining()) {
                return CoderResult.OVERFLOW;
            }
            out.put(pendingTrail);
            pendingTrail = '\u0000';
        }
        if (big5Lead != 0) {
            assert !this.report: "How come big5Lead got to be non-zero when decodeLoop() returned in the reporting mode?";
            if (!out.hasRemaining()) {
                return CoderResult.OVERFLOW;
            }
            out.put('\uFFFD');
            big5Lead = 0;            
        }
        return CoderResult.UNDERFLOW;
    }

    @Override protected void implReset() {
        big5Lead = 0;
        pendingTrail = '\u0000';
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy