All downloads are free. The search and download functionality uses the official Maven repository.

net.sf.jmatchparser.util.charset.UTFBOMCharset Maven / Gradle / Ivy

Go to download

A java-based parser for parsing/grabbing web sites and other text or XML documents, based on a nondeterministic parser language, creating XML output. Also contains a few utility classes for HTML, CSV and text parsing, and additional character sets. The jMatchParser-charset module contains the character sets.

The newest version!
/*
 * Copyright (c) 2010 - 2011 Michael Schierl
 * 
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *   
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *   
 * - Neither name of the copyright holders nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *   
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package net.sf.jmatchparser.util.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Set;

/**
 * Decode-only charset that inspects the first bytes of the input for a
 * Unicode byte order mark and picks the decoder accordingly:
 * <ul>
 * <li>{@code FE FF} selects UTF-16BE,</li>
 * <li>{@code FF FE} selects UTF-16LE,</li>
 * <li>{@code EF BB BF} selects UTF-8,</li>
 * <li>anything else selects the fallback charset given to the constructor.</li>
 * </ul>
 * The byte order mark itself is consumed and not decoded. The charset is
 * named {@link #PREFIX} followed by the fallback charset's name, and carries
 * the fallback's aliases with the same prefix. Encoding is not supported.
 */
class UTFBOMCharset extends Charset {

	/** Prefix prepended to the fallback charset's canonical name and aliases. */
	public static final String PREFIX = "UTF-BOM.";

	/** Charset used when the input does not start with a recognized byte order mark. */
	private final Charset fallback;

	/**
	 * Creates a BOM-detecting charset on top of the given fallback charset.
	 *
	 * @param fallback
	 *            charset to decode with when no byte order mark is found
	 */
	protected UTFBOMCharset(Charset fallback) {
		super(PREFIX + fallback.name(), buildAliases(fallback.aliases()));
		this.fallback = fallback;
	}

	/**
	 * Prefixes every alias of the fallback charset with {@link #PREFIX}.
	 *
	 * @param aliases
	 *            aliases of the fallback charset
	 * @return the prefixed aliases, in the set's iteration order
	 */
	private static String[] buildAliases(Set<String> aliases) {
		String[] result = new String[aliases.size()];
		int i = 0;
		for (String alias : aliases) {
			result[i] = PREFIX + alias;
			i++;
		}
		return result;
	}

	/**
	 * Reports whether this charset contains the given one: true for a charset
	 * with the same name as this one, or for any charset the fallback contains.
	 */
	@Override
	public boolean contains(Charset cs) {
		// Names are built by concatenation, so they must be compared by value,
		// not by reference.
		return cs.name().equals(this.name()) || fallback.contains(cs);
	}

	@Override
	public CharsetDecoder newDecoder() {
		return new Decoder(fallback.newDecoder());
	}

	/**
	 * Always fails; this charset can only decode.
	 *
	 * @throws UnsupportedOperationException
	 *             always
	 */
	@Override
	public CharsetEncoder newEncoder() {
		throw new UnsupportedOperationException();
	}

	@Override
	public boolean canEncode() {
		return false;
	}

	/**
	 * Decoder that consumes a possible byte order mark, then delegates all
	 * further work to the decoder selected by it (or to the fallback decoder).
	 */
	private class Decoder extends CharsetDecoder {

		private final CharsetDecoder fallbackDecoder;

		/**
		 * Bytes that were consumed while looking for a byte order mark but turned
		 * out to be ordinary content. Always kept in read mode: the bytes between
		 * position and limit still have to be fed to {@link #usedDecoder}.
		 */
		private final ByteBuffer pending = ByteBuffer.allocate(2);

		/** Decoder chosen by detection, or {@code null} while still detecting. */
		private CharsetDecoder usedDecoder = null;

		/**
		 * Detection progress: 0 if no candidate byte has been seen, otherwise the
		 * last byte of the BOM prefix seen so far (0xFE, 0xFF, 0xEF, or 0xBB
		 * meaning "EF BB").
		 */
		private byte state = 0;

		protected Decoder(CharsetDecoder fallbackDecoder) {
			super(UTFBOMCharset.this, fallbackDecoder.averageCharsPerByte(), fallbackDecoder.maxCharsPerByte() + 3);
			this.fallbackDecoder = fallbackDecoder;
			// Start with an empty read-mode buffer.
			pending.limit(0);
		}

		@Override
		protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
			while (true) {
				if (usedDecoder != null) {
					if (pending.hasRemaining()) {
						// NOTE(review): if a multi-byte fallback decoder leaves an
						// incomplete sequence unconsumed here, those bytes are not
						// merged with the following input. Also, an error result for
						// pending bytes does not refer to the caller's input position.
						CoderResult result = usedDecoder.decode(pending, out, false);
						if (!result.isUnderflow())
							return result;
					}
					return usedDecoder.decode(in, out, false);
				}
				if (in.remaining() == 0)
					return CoderResult.UNDERFLOW;
				if (out.remaining() == 0)
					return CoderResult.OVERFLOW;
				byte b = in.get();
				if (state == 0 && (b == (byte) 0xFE || b == (byte) 0xFF || b == (byte) 0xEF)) {
					// First byte of a possible byte order mark.
					state = b;
				} else if (state == (byte) 0xEF && b == (byte) 0xBB) {
					// Second byte of a possible UTF-8 byte order mark.
					state = b;
				} else if (state == (byte) 0xFE && b == (byte) 0xFF) {
					select(Charset.forName("UTF-16BE").newDecoder());
				} else if (state == (byte) 0xFF && b == (byte) 0xFE) {
					select(Charset.forName("UTF-16LE").newDecoder());
				} else if (state == (byte) 0xBB && b == (byte) 0xBF) {
					select(Charset.forName("UTF-8").newDecoder());
				} else {
					// Not a byte order mark: un-read the current byte and replay the
					// prefix bytes already consumed through the fallback decoder.
					in.position(in.position() - 1);
					stashPrefix();
					select(fallbackDecoder);
				}
			}
		}

		@Override
		protected CoderResult implFlush(CharBuffer out) {
			if (usedDecoder == null && state != 0) {
				// Input ended in the middle of a possible byte order mark; the
				// bytes seen so far are content for the fallback decoder.
				stashPrefix();
				select(fallbackDecoder);
			}
			if (usedDecoder != null) {
				if (pending.hasRemaining()) {
					CoderResult result = usedDecoder.decode(pending, out, false);
					if (!result.isUnderflow())
						return result;
				}
				// Signal end of input to the delegate, then flush it.
				ByteBuffer empty = ByteBuffer.allocate(0);
				CoderResult result = usedDecoder.decode(empty, out, true);
				if (!result.isUnderflow())
					return result;
				result = usedDecoder.flush(out);
				if (!result.isUnderflow())
					return result;
				usedDecoder = null;
			}
			return super.implFlush(out);
		}

		/**
		 * Makes the given decoder the active one, configured to report all
		 * errors, and ends byte order mark detection.
		 */
		private void select(CharsetDecoder decoder) {
			decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
			decoder.onMalformedInput(CodingErrorAction.REPORT);
			usedDecoder = decoder;
			state = 0;
		}

		/**
		 * Copies the BOM prefix bytes recorded in {@link #state} into
		 * {@link #pending} so they can be decoded as ordinary content. The
		 * buffer itself then tracks exactly which bytes are still undecoded,
		 * even if the output fills up between the two prefix bytes.
		 */
		private void stashPrefix() {
			pending.clear();
			if (state == (byte) 0xBB) {
				// State 0xBB stands for the two-byte prefix EF BB.
				pending.put((byte) 0xEF);
			}
			if (state != 0) {
				pending.put(state);
			}
			pending.flip();
		}

		@Override
		protected void implReset() {
			usedDecoder = null;
			state = 0;
			pending.clear();
			pending.limit(0);
			fallbackDecoder.reset();
		}

		@Override
		public boolean isAutoDetecting() {
			return true;
		}

		@Override
		public boolean isCharsetDetected() {
			return usedDecoder != null;
		}

		/**
		 * Returns the charset of the decoder selected by detection.
		 *
		 * @throws IllegalStateException
		 *             if no decoder has been selected yet
		 */
		@Override
		public Charset detectedCharset() {
			if (usedDecoder == null)
				throw new IllegalStateException("Charset has not been detected yet");
			return usedDecoder.charset();
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy