All downloads are free. The search and download features use the official Maven repository.

net.sf.jmatchparser.util.charset.CombinationCharset Maven / Gradle / Ivy

Go to download

A java-based parser for parsing/grabbing web sites and other text or XML documents, based on a nondeterministic parser language, creating XML output. Also contains a few utility classes for HTML, CSV and text parsing, and additional character sets. The jMatchParser-charset module contains the character sets.

The newest version!
/*
 * Copyright (c) 2010 - 2011 Michael Schierl
 * 
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *   
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *   
 * - Neither name of the copyright holders nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *   
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package net.sf.jmatchparser.util.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.util.Map;

/**
 * An 8-bit table charset that additionally understands two-byte
 * "combination" sequences: a lead byte followed by a second byte that
 * together decode to a single character.
 * 
 * Each combination string describes one lead byte. Its first character is
 * the lead byte value; the character at index {@code i} (for {@code i >= 1})
 * is the result of the lead byte followed by second byte {@code i - 1}, with
 * {@code '\uFFFD'} marking "no combination for this second byte".
 */
class CombinationCharset extends Table8BitCharset {

	/** Combination definitions as passed to the constructor (see class comment). */
	private final String[] combinations;

	/**
	 * Lazily built decoder lookup: indexed by lead byte, {@code null} for
	 * bytes that do not start a combination, otherwise the per-second-byte
	 * result table. Only written by {@link #ensureDecoderTables()}.
	 */
	private char[][] combinationsMap = null;

	/** Average chars-per-byte estimate handed to every {@link Decoder}. */
	private float avgCharsPerByte = 0;

	/**
	 * @param canonicalName
	 *            passed through to the superclass
	 * @param aliases
	 *            passed through to the superclass
	 * @param mapping
	 *            passed through to the superclass
	 * @param combinations
	 *            one string per lead byte, in the format described in the
	 *            class comment
	 */
	protected CombinationCharset(String canonicalName, String[] aliases, String mapping, String... combinations) {
		super(canonicalName, aliases, mapping);
		this.combinations = combinations;
	}

	/**
	 * Creates a new decoder, building the shared lookup tables on first use.
	 */
	@Override
	public CharsetDecoder newDecoder() {
		ensureDecoderTables();
		return new Decoder();
	}

	/**
	 * Builds {@link #combinationsMap} and {@link #avgCharsPerByte} once.
	 * 
	 * Synchronized, and the table is published only after it is complete, so
	 * that concurrent {@link #newDecoder()} callers never observe a
	 * half-filled table or a zero average (which the {@link CharsetDecoder}
	 * constructor rejects).
	 */
	private synchronized void ensureDecoderTables() {
		if (combinationsMap != null) {
			return;
		}
		char[][] map = new char[256][];
		int combinationCount = 0;
		for (String combination : combinations) {
			int first = combination.charAt(0);
			map[first] = combination.substring(1).toCharArray();
			for (int i = 1; i < combination.length(); i++) {
				if (combination.charAt(i) != '\uFFFD') {
					combinationCount++;
				}
			}
		}
		avgCharsPerByte = (combinationCount * 1 + (65536 - combinationCount) * 2) / 65536f / 2f;
		// publish last: a non-null table implies everything above is done
		combinationsMap = map;
	}

	/**
	 * Registers every defined combination character as the two-byte sequence
	 * (lead byte, second byte).
	 * 
	 * The parameter is left as a raw {@code Map} so the signature stays
	 * identical to the declaration being overridden (not visible in this
	 * file).
	 */
	@Override
	@SuppressWarnings({ "rawtypes", "unchecked" })
	protected void addExtraItemsToEncoderMap(Map encoderMap) {
		for (String combination : combinations) {
			byte first = (byte) combination.charAt(0);
			for (int i = 1; i < combination.length(); i++) {
				char c = combination.charAt(i);
				if (c != '\uFFFD') {
					// index i in the string corresponds to second byte i - 1
					encoderMap.put(c, new byte[] { first, (byte) (i - 1) });
				}
			}
		}
	}

	/**
	 * Decoder that keeps a pending lead byte between bytes (and between
	 * {@code decodeLoop} invocations) until it knows whether the following
	 * byte completes a combination.
	 */
	private class Decoder extends CharsetDecoder {

		/** Pending lead byte (0..255), or -1 when nothing is pending. */
		private int lastByte = -1;

		protected Decoder() {
			super(CombinationCharset.this, avgCharsPerByte, 2);
		}

		@Override
		protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
			while (true) {
				if (in.remaining() == 0)
					return CoderResult.UNDERFLOW;
				if (out.remaining() == 0)
					return CoderResult.OVERFLOW;
				int b = in.get() & 0xFF;
				if (lastByte != -1) {
					char[] table = combinationsMap[lastByte];
					// a table shorter than 256 entries simply defines no
					// combination for the remaining second bytes
					char c = b < table.length ? table[b] : '\uFFFD';
					if (c == '\uFFFD') {
						// no combination: un-read the current byte and emit
						// the lead byte as a plain single-byte character
						in.position(in.position() - 1);
						out.put(mapping.charAt(lastByte));
					} else {
						out.put(c);
					}
					lastByte = -1;
				} else if (combinationsMap[b] != null) {
					// possible lead byte: hold it until the next byte is seen
					lastByte = b;
				} else {
					char c = mapping.charAt(b);
					if (c == '\uFFFD') {
						// leave the offending byte unconsumed for the caller
						in.position(in.position() - 1);
						return CoderResult.unmappableForLength(1);
					}
					out.put(c);
				}
			}
		}

		/**
		 * Emits a still-pending lead byte as a plain single-byte character
		 * once the input has ended.
		 */
		@Override
		protected CoderResult implFlush(CharBuffer out) {
			if (lastByte != -1) {
				if (out.remaining() == 0)
					return CoderResult.OVERFLOW;
				out.put(mapping.charAt(lastByte));
				lastByte = -1;
			}
			return super.implFlush(out);
		}

		/** Drops any pending lead byte. */
		@Override
		protected void implReset() {
			lastByte = -1;
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy