com.univocity.parsers.common.NormalizedString Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of univocity-parsers Show documentation
Show all versions of univocity-parsers Show documentation
uniVocity's open source parsers for processing different text formats using a consistent API
/*******************************************************************************
* Copyright 2019 Univocity Software Pty Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package com.univocity.parsers.common;
import java.io.*;
import java.util.*;
import static com.univocity.parsers.common.ArgumentUtils.*;
/**
* A {@code NormalizedString} allows representing text in a normalized fashion. Strings
* with different character case or surrounding whitespace are considered the same.
*
* Used to represent groups of fields, where users may refer to their names using
* different character cases or whitespaces.
*
* Where the character case or the surrounding space is relevant, the {@code NormalizedString}
* will have its {@link #isLiteral()} method return {@code true}, meaning the exact
* character case and surrounding whitespaces are required for matching it.
*
* Invoking {@link #valueOf(String)} with a {@code String} surrounded by single quotes
* will create a literal {@code NormalizedString}. Use {@link #literalValueOf(String)}
* to obtain the same {@code NormalizedString} without having to introduce single quotes.
*
*/
public final class NormalizedString implements Serializable, Comparable, CharSequence {
private static final long serialVersionUID = -3904288692735859811L;
private static final StringCache stringCache = new StringCache() {
@Override
protected NormalizedString process(String input) {
if (input == null) {
return null;
}
return new NormalizedString(input);
}
};
private final String original;
private final String normalized;
private final boolean literal;
private final int hashCode;
private NormalizedString(String string) {
String trimmed = string.trim();
if (trimmed.length() > 2 && trimmed.charAt(0) == '\'' && trimmed.charAt(trimmed.length() - 1) == '\'') {
this.original = string.substring(1, string.length() - 1);
this.normalized = original;
this.hashCode = normalize(original).hashCode();
this.literal = true;
} else {
this.original = string;
this.normalized = normalize(original);
this.hashCode = normalized.hashCode();
this.literal = false;
}
}
private String normalize(Object value) {
String str = String.valueOf(value);
str = str.trim().toLowerCase();
return str;
}
public boolean isLiteral() {
return literal;
}
@Override
public boolean equals(Object anObject) {
if (anObject == this) {
return true;
}
if (anObject == null) {
return false;
}
if (anObject instanceof NormalizedString) {
NormalizedString other = (NormalizedString) anObject;
if (this.literal || other.literal) {
return original.equals(other.original);
}
return this.normalized.equals(other.normalized);
}
if (literal) {
return original.equals(String.valueOf(anObject));
} else {
return normalized.equals(normalize(anObject));
}
}
@Override
public int hashCode() {
return hashCode;
}
@Override
public int length() {
return original.length();
}
@Override
public char charAt(int index) {
return original.charAt(index);
}
@Override
public CharSequence subSequence(int start, int end) {
return original.subSequence(start, end);
}
@Override
public int compareTo(NormalizedString o) {
if (o == this) {
return 0;
}
if (this.literal || o.literal) {
return original.compareTo(o.original);
}
return this.normalized.compareTo(o.normalized);
}
/**
* Compares a {@code NormalizedString} against a {@code String} lexicographically.
* @param o a plain {@code String}
* @return the result of {@link String#compareTo(String)}. If this {@code NormalizedString}
* is a literal, the original argument string will be compared. If this {@code NormalizedString}
* is not a literal, the result will be from the comparison of the normalized content of both strings
* (i.e. surrounding whitespaces and character case differences will be ignored).
*/
public int compareTo(String o) {
return compareTo(valueOf(o));
}
@Override
public String toString() {
return original;
}
/**
* Creates a literal {@code NormalizedString}, meaning it will only match with
* other {@code String} or {@code NormalizedString} if they have the exact same content
* including character case and surrounding whitespaces.
*
* @param string the input {@code String}
* @return the literal {@code NormalizedString} version of the given string.
*/
public static NormalizedString literalValueOf(String string) {
if (string == null) {
return null;
}
return stringCache.get('\'' + string + "\'");
}
/**
* Creates a non-literal {@code NormalizedString}, meaning it will match with
* other {@code String} or {@code NormalizedString} regardless of different
* including character case and surrounding whitespaces.
*
* If the input value is enclosed with single quotes, a literal {@code NormalizedString}
* will be returned, as described in {@link #literalValueOf(String)}
*
* @param o the input object whose {@code String} representation will be used
* @return the {@code NormalizedString} of the given object.
*/
public static NormalizedString valueOf(Object o) {
if (o == null) {
return null;
}
return stringCache.get(o.toString());
}
/**
* Creates a non-literal {@code NormalizedString}, meaning it will match with
* other {@code String} or {@code NormalizedString} regardless of different
* including character case and surrounding whitespaces.
*
* If the input string is enclosed with single quotes, a literal {@code NormalizedString}
* will be returned, as described in {@link #literalValueOf(String)}
*
* @param string the input string
* @return the {@code NormalizedString} of the given string.
*/
public static NormalizedString valueOf(String string) {
if (string == null) {
return null;
}
return stringCache.get(string);
}
/**
* Converts a {@code NormalizedString} back to its original {@code String} representation
* @param string the normalized string
* @return the original string used to create the given normalized representation.
*/
public static String valueOf(NormalizedString string) {
if (string == null) {
return null;
}
return string.original;
}
/**
* Converts a collection of plain strings into an array of {@code NormalizedString}
* @param args the strings to convert to {@code NormalizedString}
* @return the {@code NormalizedString} representations of all input strings.
*/
public static NormalizedString[] toArray(Collection args) {
if (args == null) {
throw new IllegalArgumentException("String collection cannot be null");
}
NormalizedString[] out = new NormalizedString[args.size()];
Iterator it = args.iterator();
for (int i = 0; i < out.length; i++) {
out[i] = valueOf(it.next());
}
return out;
}
/**
* Converts a collection of normalized strings into an array of {@code String}
* @param args the normalized strings to convert back to to {@code String}
* @return the {@code String} representations of all normalized strings.
*/
public static String[] toStringArray(Collection args) {
if (args == null) {
throw new IllegalArgumentException("String collection cannot be null");
}
String[] out = new String[args.size()];
Iterator it = args.iterator();
for (int i = 0; i < out.length; i++) {
out[i] = valueOf(it.next());
}
return out;
}
/**
* Converts multiple plain strings into an array of {@code NormalizedString}, ensuring
* no duplicate {@code NormalizedString} elements exist, even if their original {@code String}s
* are different.
*
* @param args the strings to convert to {@code NormalizedString}
* @return the {@code NormalizedString} representations of all input strings.
*/
public static NormalizedString[] toUniqueArray(String... args) {
notEmpty("Element array", args);
NormalizedString[] out = toArray(args);
NormalizedString[] duplicates = findDuplicates(out);
if (duplicates.length > 0) {
throw new IllegalArgumentException("Duplicate elements found: " + Arrays.toString(duplicates));
}
return out;
}
/**
* Converts multiple plain strings into an array of {@code NormalizedString}.
*
* @param args the strings to convert to {@code NormalizedString}
* @return the {@code NormalizedString} representations of all input strings.
*/
public static NormalizedString[] toArray(String... args) {
if (args == null) {
return null;
} else if (args.length == 0) {
return EMPTY_NORMALIZED_STRING_ARRAY;
}
NormalizedString[] out = new NormalizedString[args.length];
for (int i = 0; i < args.length; i++) {
out[i] = valueOf(args[i]);
}
return out;
}
/**
* Converts multiple normalized strings into an array of {@code String}.
*
* @param args the normalized strings to convert to {@code String}
* @return the {@code String} representations of all input strings.
*/
public static String[] toArray(NormalizedString... args) {
if (args == null) {
return null;
} else if (args.length == 0) {
return EMPTY_STRING_ARRAY;
}
String[] out = new String[args.length];
for (int i = 0; i < args.length; i++) {
out[i] = valueOf(args[i]);
}
return out;
}
private static > T getCollection(T out, String... args) {
Collections.addAll(out, toArray(args));
return out;
}
private static > T getCollection(T out, Collection args) {
Collections.addAll(out, toArray(args));
return out;
}
private static > T getCollection(T out, NormalizedString... args) {
Collections.addAll(out, toArray(args));
return out;
}
private static > T getStringCollection(T out, Collection args) {
Collections.addAll(out, toStringArray(args));
return out;
}
/**
* Converts multiple plain strings into an {@code ArrayList} of {@code NormalizedString}.
*
* @param args the strings to convert to {@code NormalizedString}
* @return the {@code NormalizedString} representations of all input strings.
*/
public static ArrayList toArrayList(String... args) {
return getCollection(new ArrayList(), args);
}
/**
* Converts multiple plain strings into an {@code ArrayList} of {@code NormalizedString}.
*
* @param args the strings to convert to {@code NormalizedString}
* @return the {@code NormalizedString} representations of all input strings.
*/
public static ArrayList toArrayList(Collection args) {
return getCollection(new ArrayList(), args);
}
/**
* Converts multiple normalized strings into a {@code HashSet} of {@code String}.
*
* @param args the normalized strings to convert to {@code String}
* @return the original {@code String}s of all input normalized strings.
*/
public static ArrayList toArrayListOfStrings(NormalizedString... args) {
return getCollection(new ArrayList(), args);
}
/**
* Converts multiple normalized strings into a {@code HashSet} of {@code String}.
*
* @param args the normalized strings to convert to {@code String}
* @return the original {@code String}s of all input normalized strings.
*/
public static ArrayList toArrayListOfStrings(Collection args) {
return getStringCollection(new ArrayList(), args);
}
/**
* Converts multiple plain strings into a {@code TreeSet} of {@code NormalizedString}.
*
* @param args the strings to convert to {@code NormalizedString}
* @return the {@code NormalizedString} representations of all input strings.
*/
public static TreeSet toTreeSet(String... args) {
return getCollection(new TreeSet(), args);
}
/**
* Converts multiple plain strings into a {@code TreeSet} of {@code NormalizedString}.
*
* @param args the strings to convert to {@code NormalizedString}
* @return the {@code NormalizedString} representations of all input strings.
*/
public static TreeSet toTreeSet(Collection args) {
return getCollection(new TreeSet(), args);
}
/**
* Converts multiple normalized strings into a {@code HashSet} of {@code String}.
*
* @param args the normalized strings to convert to {@code String}
* @return the original {@code String}s of all input normalized strings.
*/
public static TreeSet toTreeSetOfStrings(NormalizedString... args) {
return getCollection(new TreeSet(), args);
}
/**
* Converts multiple normalized strings into a {@code HashSet} of {@code String}.
*
* @param args the normalized strings to convert to {@code String}
* @return the original {@code String}s of all input normalized strings.
*/
public static TreeSet toTreeSetOfStrings(Collection args) {
return getStringCollection(new TreeSet(), args);
}
/**
* Converts multiple plain strings into a {@code HashSet} of {@code NormalizedString}.
*
* @param args the strings to convert to {@code NormalizedString}
* @return the {@code NormalizedString} representations of all input strings.
*/
public static HashSet toHashSet(String... args) {
return getCollection(new HashSet(), args);
}
/**
* Converts multiple plain strings into a {@code HashSet} of {@code NormalizedString}.
*
* @param args the strings to convert to {@code NormalizedString}
* @return the {@code NormalizedString} representations of all input strings.
*/
public static HashSet toHashSet(Collection args) {
return getCollection(new HashSet(), args);
}
/**
* Converts multiple normalized strings into a {@code HashSet} of {@code String}.
*
* @param args the normalized strings to convert to {@code String}
* @return the original {@code String}s of all input normalized strings.
*/
public static HashSet toHashSetOfStrings(NormalizedString... args) {
return getCollection(new HashSet(), args);
}
/**
* Converts multiple normalized strings into a {@code HashSet} of {@code String}.
*
* @param args the normalized strings to convert to {@code String}
* @return the original {@code String}s of all input normalized strings.
*/
public static HashSet toHashSetOfStrings(Collection args) {
return getStringCollection(new HashSet(), args);
}
/**
* Converts multiple plain strings into a {@code LinkedHashSet} of {@code NormalizedString}.
*
* @param args the strings to convert to {@code NormalizedString}
* @return the {@code NormalizedString} representations of all input strings.
*/
public static LinkedHashSet toLinkedHashSet(String... args) {
return getCollection(new LinkedHashSet(), args);
}
/**
* Converts multiple plain strings into a {@code LinkedHashSet} of {@code NormalizedString}.
*
* @param args the strings to convert to {@code NormalizedString}
* @return the {@code NormalizedString} representations of all input strings.
*/
public static LinkedHashSet toLinkedHashSet(Collection args) {
return getCollection(new LinkedHashSet(), args);
}
/**
* Converts multiple normalized strings into a {@code LinkedHashSet} of {@code String}.
*
* @param args the normalized strings to convert to {@code String}
* @return the original {@code String}s of all input normalized strings.
*/
public static LinkedHashSet toLinkedHashSetOfStrings(NormalizedString... args) {
return getCollection(new LinkedHashSet(), args);
}
/**
* Converts multiple normalized strings into a {@code LinkedHashSet} of {@code String}.
*
* @param args the normalized strings to convert to {@code String}
* @return the original {@code String}s of all input normalized strings.
*/
public static LinkedHashSet toLinkedHashSetOfStrings(Collection args) {
return getStringCollection(new LinkedHashSet(), args);
}
/**
* Returns the literal representation of this {@code NormalizedString}, meaning it will only match with
* other {@code String} or {@code NormalizedString} if they have the exact same content
* including character case and surrounding whitespaces.
*
* @return the literal representation of the current {@code NormalizedString}
*/
public NormalizedString toLiteral() {
if (literal) {
return this;
}
return literalValueOf(this.original);
}
/**
* Analyzes a group of NormalizedString to identify any instances whose normalized content will generate
* clashes. Any clashing entries will be converted to their literal counterparts (using {@link #toLiteral()}),
* making it possible to identify one from the other.
*
* @param strings a group of identifiers that may contain ambiguous entries if their character case or surrounding whitespaces is not considered.
* This array will be modified.
*
* @return the input string array, with {@code NormalizedString} literals in the positions where clashes would originally occur.
*/
public static NormalizedString[] toIdentifierGroupArray(NormalizedString[] strings) {
identifyLiterals(strings);
return strings;
}
/**
* Analyzes a group of String to identify any instances whose normalized content will generate
* clashes. Any clashing entries will be converted to their literal counterparts (using {@link #toLiteral()}),
* making it possible to identify one from the other.
*
* @param strings a group of identifiers that may contain ambiguous entries if their character case or surrounding whitespaces is not considered.
*
*
* @return a {@code NormalizedString} array with literals in the positions where clashes would originally occur.
*/
public static NormalizedString[] toIdentifierGroupArray(String[] strings) {
NormalizedString[] out = toArray(strings);
identifyLiterals(out, false, false);
return out;
}
/**
* Analyzes a group of NormalizedString to identify any instances whose normalized content will generate
* clashes. Any clashing entries will be converted to their literal counterparts (using {@link #toLiteral()}),
* making it possible to identify one from the other.
*
* @param strings a group of identifiers that may contain ambiguous entries if their character case or surrounding whitespaces is not considered.
* This array will be modified.
*
* @return {@code true} if any entry has been modified to be a literal, otherwise {@code false}
*
*/
public static boolean identifyLiterals(NormalizedString[] strings) {
return identifyLiterals(strings, false, false);
}
/**
* Analyzes a group of NormalizedString to identify any instances whose normalized content will generate
* clashes. Any clashing entries will be converted to their literal counterparts (using {@link #toLiteral()}),
* making it possible to identify one from the other.
*
* @param strings a group of identifiers that may contain ambiguous entries if their character case or surrounding whitespaces is not considered.
* This array will be modified.
*
* @param lowercaseIdentifiers flag indicating that identifiers are stored in lower case (for compatibility with databases).
* If a string has a uppercase character, it means it must become a literal.
* @param uppercaseIdentifiers flag indicating that identifiers are stored in upper case (for compatibility with databases).
* If a string has a lowercase character, it means it must become a literal.
*
* @return {@code true} if any entry has been modified to be a literal, otherwise {@code false}
*
*/
public static boolean identifyLiterals(NormalizedString[] strings, boolean lowercaseIdentifiers, boolean uppercaseIdentifiers) {
if (strings == null) {
return false;
}
TreeMap normalizedMap = new TreeMap();
boolean modified = false;
for (int i = 0; i < strings.length; i++) {
NormalizedString string = strings[i];
if (string == null || string.isLiteral()) {
continue;
}
if (shouldBeLiteral(string.original, lowercaseIdentifiers, uppercaseIdentifiers)) {
strings[i] = NormalizedString.literalValueOf(string.original);
continue;
}
Object[] clashing = normalizedMap.get(string);
if (clashing != null && !string.original.equals(((NormalizedString) clashing[0]).original)) {
strings[i] = NormalizedString.literalValueOf(string.original);
strings[(Integer) clashing[1]] = ((NormalizedString) clashing[0]).toLiteral();
modified = true;
} else {
normalizedMap.put(string, new Object[]{string, i});
}
}
return modified;
}
private static boolean shouldBeLiteral(String string, boolean lowercaseIdentifiers, boolean uppercaseIdentifiers) {
if (lowercaseIdentifiers || uppercaseIdentifiers) {
for (int i = 0; i < string.length(); i++) {
char ch = string.charAt(i);
if ((uppercaseIdentifiers && !Character.isUpperCase(ch)) || (lowercaseIdentifiers && !Character.isLowerCase(ch))) {
return true;
}
}
}
return false;
}
/**
* Returns the internal string cache to allow users to tweak its size limit or clear it when appropriate
* @return the string cache used to store {@code NormalizedString} instances associated with their original {@code String}.
*/
public static StringCache getCache(){
return stringCache;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy