All downloads are free. The search and download functionality uses the official Maven repository.

com.cobber.fta.core.Utils Maven / Gradle / Ivy

Go to download

Analyze Text data to determine simple type and Semantic type information as well as other key metrics associated with a text stream.

There is a newer version: 15.7.14
Show newest version
/*
 * Copyright 2017-2024 Tim Segall
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cobber.fta.core;

import java.io.IOException;
import java.io.StringReader;
import java.security.SecureRandom;
import java.text.NumberFormat;
import java.text.ParseException;
import java.text.ParsePosition;
import java.util.Base64;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.xml.parsers.DocumentBuilderFactory;

import org.xml.sax.InputSource;

import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * A Utility class with a set of helper functions.
 */
public final class Utils {

	/**
	 * Private constructor - this final class only exposes static helpers, so no instance is ever created.
	 */
	private Utils() {
		// Never called
	}

	public static > Map sortByValue(final Map map) {
		return map.entrySet().stream().sorted(Map.Entry.comparingByValue(Comparator.reverseOrder()))
				.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e1, LinkedHashMap::new));
	}

	/**
	 * Replace the first occurrence of a literal (non-regex) String within the input.
	 * @param input The String to search.
	 * @param oldString The literal text to look for.
	 * @param newString The text to substitute for the first occurrence of oldString.
	 * @return The input itself if oldString does not occur, otherwise a copy with the first occurrence replaced.
	 */
	public static String replaceFirst(final String input, final String oldString, final String newString) {
		final int at = input.indexOf(oldString);
		if (at < 0)
			return input;

		final String head = input.substring(0, at).concat(newString);
		return head.concat(input.substring(at + oldString.length()));
	}

	/**
	 * Replace the last occurrence of a literal (non-regex) String within the input.
	 * @param input The String to search.
	 * @param oldString The literal text to look for.
	 * @param newString The text to substitute for the last occurrence of oldString.
	 * @return The input itself if oldString does not occur, otherwise a copy with the last occurrence replaced.
	 */
	public static String replaceLast(final String input, final String oldString, final String newString) {
		final int at = input.lastIndexOf(oldString);
		if (at < 0)
			return input;

		final String head = input.substring(0, at).concat(newString);
		return head.concat(input.substring(at + oldString.length()));
	}

	/**
	 * Replace a fixed-size region of the input with a new String.
	 * @param input The String to modify.
	 * @param offset The index at which the region to replace starts.
	 * @param length The number of characters to remove starting at offset.
	 * @param newString The text inserted in place of the removed region.
	 * @return A copy of the input with the characters in [offset, offset + length) replaced by newString.
	 */
	public static String replaceAt(final String input, final int offset, final int length, final String newString) {
		final String head = input.substring(0, offset).concat(newString);
		return head.concat(input.substring(offset + length));
	}

	/**
	 * Build a String consisting of a single character repeated {@code count} times.
	 * @param c The character to repeat.
	 * @param count The number of times to repeat the character.
	 * @return A String of length {@code count} in which every character is {@code c}.
	 */
	public static String repeat(final char c, final int count) {
		// The tiny cases are by far the cheapest to answer directly
		switch (count) {
		case 0:
			return "";
		case 1:
			return String.valueOf(c);
		case 2:
			return String.valueOf(new char[] {c, c});
		default:
			break;
		}

		final char[] filled = new char[count];
		for (int i = 0; i < filled.length; i++)
			filled[i] = c;
		return new String(filled);
	}

	/**
	 * Extract a non-negative integer of between minLength and maxLength digits from a fixed position in a String.
	 * maxLength characters are used only when the two lengths differ, the input is long enough, and the
	 * character at the last of those positions is a digit; in every other case minLength characters are used.
	 * @param input String to extract integer from
	 * @param offset Integer offset that marks the start
	 * @param minLength minimum length of integer to be extracted.
	 * @param maxLength maximum length of integer to be extracted.
	 * @return The integer value from the supplied String, or -1 if the selected characters do not form an integer.
	 */
	public static int getValue(final String input, final int offset, final int minLength, final int maxLength) {
		final boolean useMax = minLength != maxLength
				&& offset + maxLength <= input.length()
				&& Character.isDigit(input.charAt(offset + maxLength - 1));
		final int width = useMax ? maxLength : minLength;

		try {
			return Integer.parseInt(input.substring(offset, offset + width));
		}
		catch (NumberFormatException e) {
			return -1;
		}
	}

	/**
	 * Check whether the supplied input consists solely of the character '0'.
	 * @param input String to test.
	 * @return True if input is non-null, non-empty, and every character is '0'.
	 */
	public static boolean allZeroes(final String input) {
		return input != null
				&& !input.isEmpty()
				&& input.chars().allMatch(ch -> ch == '0');
	}

	/**
	 * Check whether every character of the supplied input is a digit (as defined by Character.isDigit).
	 * @param input String to test.
	 * @return True if input is non-null, non-empty, and contains only digits.
	 */
	public static boolean isNumeric(final String input) {
		if (input == null)
			return false;

		final int len = input.length();
		if (len == 0)
			return false;

		for (int i = 0; i < len; i++)
			if (!Character.isDigit(input.charAt(i)))
				return false;

		return true;
	}

	/**
	 * Check whether every character of the supplied input is a letter (as defined by Character.isLetter).
	 * @param input String to test.
	 * @return True if input is non-null, non-empty, and contains only letters.
	 */
	public static boolean isAlphas(final String input) {
		if (input == null)
			return false;

		final int len = input.length();
		if (len == 0)
			return false;

		for (int i = 0; i < len; i++)
			if (!Character.isLetter(input.charAt(i)))
				return false;

		return true;
	}

	/**
	 * Check whether the supplied character is one of the ASCII digits [0-9].
	 * @param ch Character to test.
	 * @return True if the character is in the range [0-9].
	 */
	public static boolean isSimpleNumeric(final char ch) {
		return !(ch < '0' || ch > '9');
	}

	/**
	 * Check whether the supplied character is an ASCII letter [A-Za-z].
	 * @param ch Character to test.
	 * @return True if the character is in the range [A-Za-z].
	 */
	public static boolean isSimpleAlpha(final char ch) {
		// Lowercase letters sort after uppercase ones, so anything at or above 'a' can only be lowercase
		if (ch >= 'a')
			return ch <= 'z';

		return ch >= 'A' && ch <= 'Z';
	}

	/**
	 * Check whether the supplied character is an ASCII letter or digit [A-Za-z0-9].
	 * @param ch Character to test.
	 * @return True if the character is in the range [A-Za-z0-9].
	 */
	public static boolean isSimpleAlphaNumeric(final char ch) {
		// Everything of interest lies between '0' and 'z' - reject the rest up front
		if (ch < '0' || ch > 'z')
			return false;

		return ch <= '9' || (ch >= 'A' && ch <= 'Z') || ch >= 'a';
	}

	/**
	 * Strip the trailing extension (everything from the last '.' onwards) from a file name.
	 * @param fileName The file name to process.
	 * @return The text preceding the last '.', or the unmodified file name if it contains no '.'.
	 */
	public static String getBaseName(final String fileName) {
		final int lastDot = fileName.lastIndexOf('.');
		if (lastDot < 0)
			return fileName;

		return fileName.substring(0, lastDot);
	}

	/**
	 * Normalize typographic punctuation to its plain ASCII equivalent.
	 * The following characters are replaced:
	 *  - LEFT and RIGHT SINGLE QUOTATION MARK (U+2018, U+2019) and the backtick - with a standard single quote.
	 *  - LEFT and RIGHT DOUBLE QUOTATION MARK (U+201C, U+201D) - with a standard double quote.
	 *  - en-dash and em-dash (U+2013, U+2014) - with a simple hyphen.
	 * Note: No StringBuilder is allocated unless at least one character actually needs replacing.
	 * @param input String to cleanse
	 * @return The original String if no cleansing required - or a cleansed copy if necessary.
	 */
	public static String cleanse(final String input) {
		final int len = input.length();
		StringBuilder cleansed = null;

		for (int i = 0; i < len; i++) {
			final char original = input.charAt(i);
			final char mapped;

			switch (original) {
			case '\u2018':
			case '\u2019':
			case '`':
				mapped = '\'';
				break;
			case '\u201C':
			case '\u201D':
				mapped = '\"';
				break;
			case '\u2013':
			case '\u2014':
				mapped = '-';
				break;
			default:
				mapped = original;
				break;
			}

			if (mapped != original) {
				// First replacement - start the copy with everything seen so far
				if (cleansed == null)
					cleansed = new StringBuilder(len).append(input, 0, i);
				cleansed.append(mapped);
			}
			else if (cleansed != null)
				cleansed.append(original);
		}

		return cleansed == null ? input : cleansed.toString();
	}

	// Implementation version reported for the package of this class; resolved once at class load and never reassigned
	private static final String VERSION = Utils.class.getPackage().getImplementationVersion();

	/**
	 * Get the version of the FTA library.
	 * @return The version of the FTA library as reported by Package.getImplementationVersion()
	 *  (which returns null when the version is not known).
	 */
	public static String getVersion() {
		return VERSION;
	}

	/**
	 * Generate a random String of decimal digits whose first digit is never a zero.
	 * @param random The source of randomness.
	 * @param len The number of digits requested.
	 * @return A String of digits - the first in the range [1-9], the remainder in the range [0-9].
	 */
	public static String getRandomDigits(final SecureRandom random, final int len) {
		final StringBuilder digits = new StringBuilder(len);

		// The leading digit is drawn from 1-9 so that the result never starts with a zero
		digits.append((char) ('1' + random.nextInt(9)));
		while (digits.length() < len)
			digits.append((char) ('0' + random.nextInt(10)));

		return digits.toString();
	}

	// Compiled once rather than on every call.
	// NOTE(review): this expression looks truncated - it begins with an empty alternative, so find() succeeds
	// on any input. The tag-matching portion appears to have been lost; restore it from the upstream source.
	private static final Pattern HTML_PATTERN = Pattern.compile("|]*\\s*>|(\\&(?:[\\w\\d]+|#\\d+|#x[a-f\\d]+);)");

	/**
	 * Create the factory used to test samples for well-formed XML.
	 * The samples are arbitrary user data, so resolution of external entities and loading of external DTDs
	 * is switched off (XXE protection). Hardening is best-effort: a feature the parser does not recognize is skipped.
	 */
	private static DocumentBuilderFactory newHardenedXMLFactory() {
		final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
		final String[] disabledFeatures = {
				"http://xml.org/sax/features/external-general-entities",
				"http://xml.org/sax/features/external-parameter-entities",
				"http://apache.org/xml/features/nonvalidating/load-external-dtd"
		};

		for (final String feature : disabledFeatures) {
			try {
				factory.setFeature(feature, false);
			} catch (javax.xml.parsers.ParserConfigurationException ignored) {
				// Feature not supported by this parser implementation - continue with the remaining ones
			}
		}

		return factory;
	}

	/**
	 * Determine whether every sample (the keys of the supplied Map) shares a common structured format.
	 * For each format, checking stops once five samples have been seen that did not qualify for it, since the
	 * stream can then no longer be uniformly of that format.
	 * @param mapper The Jackson ObjectMapper used to test whether a sample is JSON.
	 * @param cardinality The Map whose keys are the samples to inspect (the values are not used).
	 * @return null if there are no samples, otherwise one of "JSON", "XML", "HTML", "Base64", or "OTHER".
	 */
	public static String determineStreamFormat(final ObjectMapper mapper, final Map<String, ?> cardinality) {
		final int totalSamples = cardinality.size();

		if (totalSamples == 0)
			return null;

		int fmtJSON = 0;
		int fmtHTML = 0;
		int fmtXML = 0;
		int fmtBase64 = 0;
		int fmtRealBase64 = 0;
		int samples = 0;
		// Created lazily - many streams never contain anything that looks like XML
		DocumentBuilderFactory xmlFactory = null;

		for (final String sample : cardinality.keySet()) {
			samples++;
			if (sample.length() < 2)
				continue;
			final char first = sample.charAt(0);
			final char last = sample.charAt(sample.length() - 1);

			// Both JSON objects and JSON arrays are candidates, and both are subject to the failure limit
			if ((first == '{' || first == '[') && samples - fmtJSON < 5) {
				try {
					mapper.readTree(sample);
					fmtJSON++;
					continue;
				} catch (IOException ignored) {
					// Not JSON - fall through to the remaining checks
				}
			}

			if (first == '<' && last == '>' && samples - fmtXML < 5) {
				try {
					if (xmlFactory == null)
						xmlFactory = newHardenedXMLFactory();
					xmlFactory.newDocumentBuilder().parse(new InputSource(new StringReader(sample)));
					fmtXML++;
				} catch (Exception ignored) {
					// Not well-formed XML (or no usable parser) - the sample simply does not count as XML
				}
				// NOTE(review): a sample that fails the XML parse is not tested as HTML - confirm this is intended
				continue;
			}

			if (first == '<' && samples - (fmtHTML + fmtXML) < 5) {
				// find() actually tests the sample; groupCount() only reports how many groups the pattern declares
				if (HTML_PATTERN.matcher(sample).find())
					fmtHTML++;
				continue;
			}

			if (sample.length() % 4 == 0 && samples - fmtBase64 < 5) {
				try {
					Base64.getDecoder().decode(sample);
					fmtBase64++;
					// Padding is the only strong hint that the sample really is Base64 and not just a lucky word
					if (last == '=')
						fmtRealBase64++;
					continue;
				}
				catch (IllegalArgumentException ignored) {
					// Not Base64
				}
			}
		}

		if (totalSamples == fmtJSON)
			return "JSON";
		else if (totalSamples == fmtXML)
			return "XML";
		else if (totalSamples == fmtHTML + fmtXML)
			return "HTML";
		else if (totalSamples == fmtBase64 && fmtRealBase64 != 0)
			return "Base64";

		return "OTHER";
	}

	/**
	 * Parse a String as a long using the supplied formatter, additionally accepting surrounding whitespace,
	 * a leading '+', and a trailing minus sign (e.g. "17-").
	 * @param input The text to parse.
	 * @param longFormatter The NumberFormat used for the initial parse attempt.
	 * @return The long value represented by the input.
	 * @throws NumberFormatException If the input (including an empty or blank input) cannot be parsed as a long.
	 */
	public static long parseLong(final String input, final NumberFormat longFormatter) {
		final String trimmed = input.trim();
		// startsWith (not charAt(0)) so that an empty input reaches the NumberFormatException path below
		final String lParse = trimmed.startsWith("+") ? trimmed.substring(1) : trimmed;
		final ParsePosition lPos = new ParsePosition(0);
		final Number l = longFormatter.parse(lParse, lPos);

		// Only accept the formatter's answer if it consumed the entire input
		if (l != null && lParse.length() == lPos.getIndex())
			return l.longValue();

		// Trailing minus sign - parse the magnitude and negate it
		final int digits = lParse.length();
		if (digits >= 2 && lParse.charAt(digits - 1) == '-')
			return -Long.parseLong(lParse.substring(0, digits - 1));

		return Long.parseLong(lParse);
	}

	/**
	 * Parse a String as a double using the supplied formatter, compensating for inputs that the parser
	 * returned by NumberFormat.getInstance(locale) cannot cope with, including:
	 *  - a leading '+' or surrounding whitespace,
	 *  - a trailing minus sign (e.g. "12.5-"),
	 *  - an exponent with a '+' (e.g. 123E+5),
	 *  - an exponent character of the wrong case (e.g. 1234.0e5).
	 * @param rawInput The text to parse.
	 * @param doubleFormatter The NumberFormat used to parse the input.
	 * @return The double value represented by the input.
	 * @throws NumberFormatException If the input (including an empty or blank input) cannot be parsed as a double.
	 */
	public static Double parseDouble(final String rawInput, final NumberFormat doubleFormatter) {
		final String trimmed = rawInput.trim();
		// startsWith (not charAt(0)) so that an empty input reaches the NumberFormatException path below
		String cleaned = trimmed.startsWith("+") ? trimmed.substring(1) : trimmed;
		final ParsePosition pos = new ParsePosition(0);
		Number n = doubleFormatter.parse(cleaned, pos);
		final int upto = pos.getIndex();
		final int len = cleaned.length();
		if (n != null && upto == len)
			return n.doubleValue();

		// Trailing minus sign - the magnitude must parse in its entirety, a partial parse (e.g. "1x-") is an error
		if (len >= 2 && cleaned.charAt(len - 1) == '-') {
			final String magnitudeText = cleaned.substring(0, len - 1);
			final ParsePosition magnitudePos = new ParsePosition(0);
			final Number magnitude = doubleFormatter.parse(magnitudeText, magnitudePos);
			if (magnitude == null || magnitudePos.getIndex() != magnitudeText.length())
				throw new NumberFormatException("For input string: \"" + rawInput + "\"");
			return -magnitude.doubleValue();
		}

		// Fewer than two characters left unparsed - no room for an exponent, so let Double have the final say
		if (upto > len - 2)
			return Double.parseDouble(cleaned);

		// Did we trip up on the Exponent?
		final char exp = cleaned.charAt(upto);
		if (exp != 'E' && exp != 'e')
			return Double.parseDouble(cleaned);

		// Handle E+ which is not supported
		if (upto <= len - 3 && cleaned.charAt(upto + 1) == '+' && Character.isDigit(cleaned.charAt(upto + 2))) {
			pos.setIndex(0);
			// Drop the '+' and retry
			final String updatedInput = cleaned.substring(0, upto + 1) + cleaned.substring(upto + 2);
			n = doubleFormatter.parse(updatedInput, pos);
			if (pos.getIndex() == updatedInput.length())
				return n.doubleValue();
			// Still not fully consumed - carry on without the '+' in case the exponent case is also wrong
			cleaned = updatedInput;
		}

		// Handle the wrong case for the Exponentiation character which is not supported
		final char nextCh = cleaned.charAt(upto + 1);
		if (Character.isDigit(nextCh) || (nextCh == '-' && len > upto + 2 && Character.isDigit(cleaned.charAt(upto + 2)))) {
			final char newExp = exp == 'E' ? 'e' : 'E';
			final String updatedInput = cleaned.substring(0, upto) + newExp + cleaned.substring(upto + 1);
			pos.setIndex(0);
			n = doubleFormatter.parse(updatedInput, pos);
			if (pos.getIndex() == updatedInput.length())
				return n.doubleValue();
		}

		return Double.parseDouble(cleaned);
	}

	/**
	 * Calculate 1 - (N/N * (N-1)/N * ... * (N-samples+1)/N) where N is the size of the sample space.
	 * The product is the chance that 'samples' independent, uniformly distributed draws from the sample space
	 * are all distinct, so the value returned is the chance that at least two of them coincide.
	 * NOTE(review): the method name suggests the complement of what is returned - confirm against the callers.
	 * @param sampleSpace Size of the Sample Space
	 * @param samples number of samples observed
	 * @return One minus the probability that all of the samples are distinct.
	 */
	public static double uniquenessProbability(final int sampleSpace, final int samples) {
		final double spaceSize = sampleSpace;
		final int stopAt = sampleSpace - samples;
		double allDistinct = 1.0;

		// Each additional draw must avoid every value already taken
		int remaining = sampleSpace;
		while (remaining > stopAt) {
			allDistinct = allDistinct * remaining / spaceSize;
			remaining--;
		}

		return 1.0 - allDistinct;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy