All downloads are free. The search and download features use the official Maven repository.

net.sf.jmatchparser.util.charset.Table8BitCharset Maven / Gradle / Ivy

Go to download

A java-based parser for parsing/grabbing web sites and other text or XML documents, based on a nondeterministic parser language, creating XML output. Also contains a few utility classes for HTML, CSV and text parsing, and additional character sets. The jMatchParser-charset module contains the character sets.

The newest version!
/*
 * Copyright (c) 2009 - 2011 Michael Schierl
 * 
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *   
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *   
 * - Neither name of the copyright holders nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *   
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND THE CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package net.sf.jmatchparser.util.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

class Table8BitCharset extends Charset {

	protected final String mapping;
	private Map encoderMap = null;
	private float avgBytesPerChar = 0, maxBytesPerChar = 0;
	private byte[] replacement = null;

	protected Table8BitCharset(String canonicalName, String[] aliases, String mapping) {
		super(canonicalName, aliases);
		this.mapping = mapping;
	}

	@Override
	public boolean contains(Charset cs) {
		return cs == this;
	}

	@Override
	public CharsetDecoder newDecoder() {
		return new Decoder();
	}

	@Override
	public CharsetEncoder newEncoder() {
		if (encoderMap == null) {
			encoderMap = new HashMap();
			for (int i = 0; i < 256; i++) {
				char c = mapping.charAt(i);
				if (c != '\uFFFD') {
					encoderMap.put(c, new byte[] { (byte) i });
				}
			}
			addExtraItemsToEncoderMap(encoderMap);
			int sumLength = 0, count = 0;
			for (byte[] value : encoderMap.values()) {
				if (value.length > maxBytesPerChar)
					maxBytesPerChar = value.length;
				count++;
				sumLength += value.length;
			}
			avgBytesPerChar = sumLength / (float) count;
			replacement = findReplacement();
		}
		return new MapBasedEncoder(this, encoderMap, avgBytesPerChar, maxBytesPerChar, replacement);
	}

	private byte[] findReplacement() {
		byte[] replacement = new byte[] { (byte) '?' };
		if (Arrays.equals(encoderMap.get('?'), replacement))
			return replacement;
		for (byte[] value : encoderMap.values()) {
			if (Arrays.equals(value, replacement))
				return replacement;
		}
		replacement = encoderMap.get('?');
		if (replacement == null) {
			for (byte[] value : encoderMap.values()) {
				if (replacement == null)
					replacement = value;
				else if (value.length < replacement.length)
					replacement = value;
				else if (value.length == replacement.length && value[0] > replacement[0])
					replacement = value;
			}
		}
		return replacement;
	}

	protected void addExtraItemsToEncoderMap(Map encoderMap) {
	}

	private class Decoder extends CharsetDecoder {

		protected Decoder() {
			super(Table8BitCharset.this, 1, 1);
		}

		@Override
		protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
			while (true) {
				if (in.remaining() == 0)
					return CoderResult.UNDERFLOW;
				if (out.remaining() == 0)
					return CoderResult.OVERFLOW;
				char c = mapping.charAt(in.get() & 0xFF);
				if (c == '\uFFFD') {
					in.position(in.position() - 1);
					return CoderResult.unmappableForLength(1);
				}
				out.put(c);
			}
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy