net.sf.jmatchparser.util.charset.UTF8BinaryCharset Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of jMatchParser-charset Show documentation
A java-based parser for parsing/grabbing web sites and other text or XML documents, based on a nondeterministic parser language, creating XML output. Also contains a few utility classes for HTML, CSV and text parsing, and additional character sets. The jMatchParser-charset module contains the character sets.
The newest version!
/*
 * Copyright (c) 2010 - 2011 Michael Schierl
 * 
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *   
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *   
 * - Neither name of the copyright holders nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *   
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package net.sf.jmatchparser.util.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;

/**
 * A UTF-8 based charset that maps bytes which are not part of a well-formed
 * UTF-8 sequence to individual replacement characters instead of rejecting
 * them, so that such bytes can be encoded back unchanged.
 * 
 * <p>
 * Two variants exist, selected by the constructor flag:
 * <ul>
 * <li>{@link #SURROGATE_VARIANT}: an invalid byte {@code b} is decoded to the
 * char {@code SURROGATE_BASE_CODEPOINT + b}.</li>
 * <li>{@link #PUA_VARIANT}: an invalid byte {@code b} is decoded to the char
 * {@code PUA_BASE_CODEPOINT + b}. Characters in the range
 * {@link #PUA_ESCAPE_CHARACTER} .. {@link #MAX_PUA_CODEPOINT} that occur as
 * regular, well-formed UTF-8 in the input are prefixed with
 * {@link #PUA_ESCAPE_CHARACTER} so they can be told apart from replaced
 * bytes.</li>
 * </ul>
 */
class UTF8BinaryCharset extends Charset {

	static final String SURROGATE_VARIANT = "UTF-8-Binary";
	static final String PUA_VARIANT = "UTF-8-Binary-PUA";

	static final int PUA_BASE_CODEPOINT = 0xE900;
	static final int SURROGATE_BASE_CODEPOINT = 0xDC00;
	static final char PUA_ESCAPE_CHARACTER = (char) (PUA_BASE_CODEPOINT + 0x7F);
	static final int MAX_PUA_CODEPOINT = PUA_BASE_CODEPOINT + 0xFF;

	private static final Charset UTF_8 = Charset.forName("UTF-8");

	private final boolean usePUA;

	/**
	 * Creates one of the two charset variants.
	 * 
	 * @param usePUA
	 *            {@code true} for the {@link #PUA_VARIANT}, {@code false} for
	 *            the {@link #SURROGATE_VARIANT}
	 */
	protected UTF8BinaryCharset(boolean usePUA) {
		super(usePUA ? PUA_VARIANT : SURROGATE_VARIANT, null);
		this.usePUA = usePUA;
	}

	/**
	 * Reports both variants of this charset, and everything UTF-8 itself
	 * contains, as contained.
	 */
	@Override
	public boolean contains(Charset cs) {
		return cs instanceof UTF8BinaryCharset || UTF_8.contains(cs);
	}

	@Override
	public CharsetDecoder newDecoder() {
		return new Decoder();
	}

	@Override
	public CharsetEncoder newEncoder() {
		return new Encoder();
	}

	/**
	 * Checks whether a code point lies inside the Unicode range and is not a
	 * surrogate.
	 * 
	 * @param codePoint
	 *            the code point to test
	 * @return {@code true} if the code point is in range and outside the
	 *         surrogate block
	 */
	protected static boolean isValidCodepoint(int codePoint) {
		return (codePoint >= Character.MIN_CODE_POINT && codePoint < Character.MIN_HIGH_SURROGATE)
				|| (codePoint > Character.MAX_LOW_SURROGATE && codePoint <= Character.MAX_CODE_POINT);
	}

	/**
	 * Checks whether a code point must be prefixed by
	 * {@link #PUA_ESCAPE_CHARACTER} in the PUA variant, i.e. whether it is the
	 * escape character itself or one of the characters up to
	 * {@link #MAX_PUA_CODEPOINT}.
	 * 
	 * @param codePoint
	 *            the code point to test
	 * @return {@code true} if the code point lies in the escaped range
	 */
	protected static boolean needEscaping(int codePoint) {
		return codePoint >= PUA_ESCAPE_CHARACTER && codePoint <= MAX_PUA_CODEPOINT;
	}

	/**
	 * Decoder that collects the bytes of one candidate UTF-8 sequence in a
	 * small buffer, validates the sequence with a strict UTF-8 decoder and
	 * writes either the decoded character(s) or one replacement character per
	 * collected byte.
	 */
	private class Decoder extends CharsetDecoder {

		// Positive state values count the bytes still missing from the
		// sequence being collected. "Full" is deliberately 0, so that
		// decrementing from 1 after the last byte lands on it.
		private static final byte STATE_BUFFER_FULL = 0;
		// The collected bytes are not valid and await replacement.
		private static final byte STATE_BUFFER_FULL_INVALID = -10;
		// No sequence is in progress.
		private static final byte STATE_BUFFER_EMPTY = -20;

		private final CharsetDecoder utf8Decoder = UTF_8.newDecoder();
		private final ByteBuffer buffer = ByteBuffer.allocate(4);
		private byte state = STATE_BUFFER_EMPTY;

		protected Decoder() {
			// The PUA variant may emit an escape character in front of a
			// decoded character, hence the larger maximum.
			super(UTF8BinaryCharset.this, 1.0f, usePUA ? 2.0f : 1.0f);
			utf8Decoder.onMalformedInput(CodingErrorAction.REPORT);
			utf8Decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
		}

		@Override
		protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
			while (true) {
				if (state == STATE_BUFFER_FULL_INVALID) {
					if (!writeReplacements(out)) {
						return CoderResult.OVERFLOW;
					}
				}
				if (state == STATE_BUFFER_FULL) {
					// decodeBuffer only fails when the output has too little
					// room, so the caller must be asked for more output
					// space, not for more input.
					if (!decodeBuffer(out)) {
						return CoderResult.OVERFLOW;
					}
				}
				if (in.remaining() == 0) {
					return CoderResult.UNDERFLOW;
				}
				if (out.remaining() == 0) {
					return CoderResult.OVERFLOW;
				}
				byte b = in.get();
				if (state == STATE_BUFFER_EMPTY) {
					calculateState(b);
				} else if ((b >> 6) != -2) {
					// Not a continuation byte (10xxxxxx): the sequence in
					// progress is truncated, so replace what was collected
					// and start over with this byte.
					if (!writeReplacements(out)) {
						in.position(in.position() - 1);
						state = STATE_BUFFER_FULL_INVALID;
						return CoderResult.OVERFLOW;
					}
					calculateState(b);
				}
				buffer.put(b);
				state--;
				if (state == -1) {
					// Defensive: should not be reachable, because a full
					// buffer is drained at the top of the loop.
					state = STATE_BUFFER_FULL_INVALID;
				}
			}
		}

		/**
		 * Validates the collected sequence with the strict UTF-8 decoder and
		 * writes the result.
		 * 
		 * @return {@code false} if the output buffer has too little room; the
		 *         collected bytes and a matching state are kept so the call
		 *         can be repeated
		 */
		private boolean decodeBuffer(CharBuffer out) {
			state = STATE_BUFFER_EMPTY;
			ByteBuffer dup = buffer.duplicate();
			dup.flip();
			CharBuffer cb = CharBuffer.allocate(2);
			CoderResult cr = utf8Decoder.decode(dup, cb, false);
			cb.flip();
			if (cr.isOverflow()) {
				throw new IllegalStateException("UTF8 decoder buffer too small");
			}
			if (cr.isMalformed()) {
				state = STATE_BUFFER_FULL_INVALID;
			} else if (!cr.isUnderflow()) {
				throw new IllegalStateException("Unexpected coder result: " + cr.toString());
			} else if (dup.remaining() != 0) {
				throw new IllegalStateException("Remaining size is " + dup.remaining());
			} else if (!emitDecoded(cb, out)) {
				// Keep the bytes and retry once there is more room.
				state = STATE_BUFFER_FULL;
				return false;
			}
			if (state == STATE_BUFFER_FULL_INVALID) {
				return writeReplacements(out);
			}
			return true;
		}

		/**
		 * Writes the one or two chars decoded from the collected sequence,
		 * adding the escape character where the PUA variant requires it. If
		 * the chars are not acceptable, the state is switched to
		 * {@link #STATE_BUFFER_FULL_INVALID} instead.
		 * 
		 * @return {@code false} if the output buffer has too little room, in
		 *         which case nothing has been written
		 */
		private boolean emitDecoded(CharBuffer cb, CharBuffer out) {
			int count = cb.remaining();
			if (count == 1) {
				char c = cb.get();
				if (usePUA && needEscaping(c)) {
					if (out.remaining() < 2) {
						return false;
					}
					out.put(PUA_ESCAPE_CHARACTER);
					out.put(c);
				} else if (isValidCodepoint(c)) {
					if (out.remaining() < 1) {
						return false;
					}
					out.put(c);
				} else {
					state = STATE_BUFFER_FULL_INVALID;
					return true;
				}
			} else if (count == 2) {
				char c1 = cb.get();
				char c2 = cb.get();
				if (Character.isSurrogatePair(c1, c2) && isValidCodepoint(Character.toCodePoint(c1, c2))) {
					// Both halves must fit; writing only one would overflow
					// the output buffer on the second put.
					if (out.remaining() < 2) {
						return false;
					}
					out.put(c1);
					out.put(c2);
				} else {
					state = STATE_BUFFER_FULL_INVALID;
					return true;
				}
			} else {
				throw new IllegalStateException("Buffer size is " + count);
			}
			buffer.clear();
			return true;
		}

		/**
		 * Writes one replacement character per collected byte and empties the
		 * collection buffer.
		 * 
		 * @return {@code false} if the output buffer cannot take all
		 *         replacement characters; nothing is written in that case
		 */
		private boolean writeReplacements(CharBuffer out) {
			if (out.remaining() < buffer.position()) {
				return false;
			}
			int base = usePUA ? PUA_BASE_CODEPOINT : SURROGATE_BASE_CODEPOINT;
			buffer.flip();
			while (buffer.remaining() > 0) {
				out.put((char) (base + (buffer.get() & 0xFF)));
			}
			buffer.clear();
			state = STATE_BUFFER_EMPTY;
			return true;
		}

		/**
		 * Sets the state to the sequence length announced by a lead byte. For
		 * a byte that cannot start a sequence, the state is set so that the
		 * decrement following the call marks the buffer as invalid.
		 */
		private void calculateState(byte b) {
			if (b >= 0) {
				state = 1; // 0xxxxxxx
			} else if ((b >> 5) == -2) {
				state = 2; // 110xxxxx
			} else if ((b >> 4) == -2) {
				state = 3; // 1110xxxx
			} else if ((b >> 3) == -2) {
				state = 4; // 11110xxx
			} else {
				state = STATE_BUFFER_FULL_INVALID + 1;
			}
		}

		@Override
		protected CoderResult implFlush(CharBuffer out) {
			// Bytes of a sequence that was cut off by the end of the input
			// are written as replacements.
			if (!writeReplacements(out)) {
				return CoderResult.OVERFLOW;
			}
			return super.implFlush(out);
		}

		@Override
		protected void implReset() {
			buffer.clear();
			state = STATE_BUFFER_EMPTY;
		}
	}

	/**
	 * Encoder that turns replacement characters back into the single bytes
	 * they stand for and delegates every other character to a UTF-8 encoder.
	 */
	class Encoder extends CharsetEncoder {

		private final CharsetEncoder utf8Encoder = UTF_8.newEncoder();

		protected Encoder() {
			super(UTF8BinaryCharset.this, 1.1f, 4.0f);
		}

		@Override
		protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
			CharBuffer buf = CharBuffer.allocate(2);
			int baseCodepoint = usePUA ? PUA_BASE_CODEPOINT : SURROGATE_BASE_CODEPOINT;
			while (in.remaining() > 0) {
				char c = in.charAt(0);
				boolean escaped = false;
				if (usePUA && c == PUA_ESCAPE_CHARACTER) {
					if (in.remaining() < 2) {
						return CoderResult.UNDERFLOW;
					}
					c = in.charAt(1);
					boolean escapable = c == PUA_ESCAPE_CHARACTER
							|| (c >= baseCodepoint + 0x80 && c <= baseCodepoint + 0xFF);
					if (!escapable) {
						return CoderResult.unmappableForLength(1);
					}
					// The escape character is consumed only after the escaped
					// character has been written. Consuming it earlier would
					// lose the escape if the output overflows below.
					escaped = true;
				}
				if (!escaped && c >= baseCodepoint + 0x80 && c <= baseCodepoint + 0xFF) {
					// Replacement character: emit the raw byte it stands for.
					if (out.remaining() == 0) {
						return CoderResult.OVERFLOW;
					}
					in.get();
					out.put((byte) (c - baseCodepoint));
					continue;
				}
				buf.clear();
				buf.put(c);
				boolean surrogatePair = false;
				// An escaped character lies in the PUA range and is never a
				// surrogate, so the offsets below need no escape adjustment.
				if (!escaped && Character.isHighSurrogate(c)) {
					if (in.remaining() == 1) {
						return CoderResult.UNDERFLOW;
					}
					char c2 = in.charAt(1);
					if (Character.isLowSurrogate(c2)) {
						surrogatePair = true;
						buf.put(c2);
					}
				}
				buf.flip();
				CoderResult cr = utf8Encoder.encode(buf, out, false);
				if (!cr.isUnderflow()) {
					return cr;
				}
				if (buf.remaining() > 0) {
					if (surrogatePair && buf.remaining() == 1) {
						in.get();
					}
					return CoderResult.unmappableForLength(buf.remaining());
				}
				if (escaped) {
					in.get(); // the escape character
				}
				in.get();
				if (surrogatePair) {
					in.get();
				}
			}
			return CoderResult.UNDERFLOW;
		}
	}
}