All Downloads are FREE. The search and download functionality uses the official Maven repository.

net.amygdalum.util.text.ByteEncoding Maven / Gradle / Ivy

The newest version!
package net.amygdalum.util.text;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import net.amygdalum.util.builders.Lists;
import net.amygdalum.util.builders.Maps;

public final class ByteEncoding {

	private static final Map> partitionings = initPartitionings();

	private static Map> initPartitionings() {
		return Maps.> hashed()
			.put(UTF_8, Lists. list()
				.add(new ByteRange((byte) 0b0, (byte) 0b0111_1111, 128))
				.add(new ByteRange(new byte[]{(byte) 0b1100_0010, (byte) 0b1000_0000}, new byte[]{(byte) 0b1101_1111, (byte) 0b1011_1111}, 1920))
				.add(new ByteRange(new byte[]{(byte) 0b1110_0000, (byte) 0b1010_0000, (byte) 0b1000_0000}, new byte[]{(byte) 0b1110_1111, (byte) 0b1011_1111, (byte) 0b1011_1111}, 63488))
				.build())
			.build();
	}


	public static List getPartitioningFor(Charset charset) {
		List part = partitionings.get(charset);
		if (part == null) {
			part = bruteForce();
			partitionings.put(charset, part);
		}
		return part;
	}

	private static List bruteForce() {
		List ranges = new ArrayList<>();
		byte[] start = null;
		int size = 0;
		byte[] last = null;
		for (int i = Character.MIN_VALUE; i <= Character.MAX_VALUE; i++) {
			byte[] current = encode((char) i);
			if (start == null) {
				start = current;
				last = current;
				size = 1;
			} else if (start.length == current.length) {
				last = current;
				size++;
			} else {
				ranges.add(new ByteRange(start, last, size));
				start = current;
				last = current;
				size = 1;
			}
			
		}
		return ranges;
	}

	public static byte[] encode(String pattern) {
		return encode(pattern, UTF_8);
	}

	public static byte[] encode(String pattern, Charset charset) {
		try {
			CharsetEncoder encoder = charset.newEncoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);
			ByteBuffer buffer = encoder.encode(CharBuffer.wrap(pattern));
			byte[] encoded = new byte[buffer.limit()];
			buffer.get(encoded);
			return encoded;
		} catch (CharacterCodingException e) {
			return new byte[0];
		}
	}

	public static byte[] encode(char pattern) {
		return encode(UTF_8, pattern);
	}

	public static byte[] encode(Charset charset, char pattern) {
		try {
			CharsetEncoder encoder = charset.newEncoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);
			ByteBuffer buffer = encoder.encode(CharBuffer.wrap(new char[] { pattern }));
			byte[] encoded = new byte[buffer.limit()];
			buffer.get(encoded);
			return encoded;
		} catch (CharacterCodingException e) {
			return new byte[0];
		}
	}

	public static String decode(byte... pattern) {
		return decode(UTF_8, pattern);
	}

	public static String decode(Charset charset, byte ... pattern) {
		try {
			CharsetDecoder decoder = charset.newDecoder()
				.onMalformedInput(CodingErrorAction.REPORT)
				.onUnmappableCharacter(CodingErrorAction.REPORT);
			CharBuffer buffer = decoder.decode(ByteBuffer.wrap(pattern));
			return buffer.toString();
		} catch (CharacterCodingException e) {
			return "";
		}
	}

	public static List intervals(Charset charset, char from, char to) {
		byte[] bytesFrom = encode(charset, from);
		byte[] bytesTo = encode(charset, to);

		List partitioning = getPartitioningFor(charset);

		List intervals = new ArrayList<>(partitioning.size());
		char base = 0;
		for (ByteRange part : partitioning) {
			if (from >= base && from < base + part.size() && to >= base && to < base + part.size()) {
				intervals.add(new ByteRange(bytesFrom, bytesTo, (int) to - from));
			} else if (from >= base && from < base + part.size()) {
				intervals.add(new ByteRange(bytesFrom, part.to, (int) base + part.size() - from));
			} else if (from < base && to >= base + part.size()) {
				intervals.add(part);
			} else if (to >= base && to < base + part.size()) {
				intervals.add(new ByteRange(part.from, bytesTo, (int) to - base));
			}
			base += part.size();
		}
		return intervals;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy