// be.bagofwords.util.StringUtils (source listing: Maven / Gradle / Ivy)
package be.bagofwords.util;
import be.bagofwords.text.*;
import it.unimi.dsi.fastutil.chars.CharArrayList;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class StringUtils extends org.apache.commons.lang3.StringUtils {
/** Public constant with value 1. NOTE(review): its consumers are outside the visible part of this file - confirm what is padded. */
public static final int NUM_OF_PADDED_SPACES = 1;
/**
 * Quote characters (straight double, straight single, and the typographic single and double
 * variants) that {@link #normalizeQuotationMarks(char[])} rewrites to a plain apostrophe.
 */
static final char[] quotes = {'"', '\'', '‘', '’', '“', '”'};
/** Created empty here. NOTE(review): presumably filled and read elsewhere in this class - confirm its role there. */
private static final CharArrayList characterMapping = new CharArrayList();
/*
 * Character-to-character substitution tables. Each table maps a character to the single
 * character that replaces it; all four are filled once in the static initializer below.
 * NOTE(review): the code that applies these tables is outside the visible part of the file.
 */

/** Slash substitutions: forward slash becomes '+', backslash becomes '-'. */
private static final Map<Character, Character> escapeFileNameMapping;
/** Replaces a tab with a single space. */
private static final Map<Character, Character> escapeTab;
/** Replaces '/', carriage return, line feed, tab and space with '-', '+', '$', the section sign and '.' respectively. */
private static final Map<Character, Character> escapePath;
/** Maps carriage return to the letter 'r' and line feed to the letter 'n'. */
private static final Map<Character, Character> escapeNewLine;

static {
    escapeFileNameMapping = new HashMap<>();
    escapeFileNameMapping.put('/', '+');
    escapeFileNameMapping.put('\\', '-');

    escapeTab = new HashMap<>();
    escapeTab.put('\t', ' ');

    escapePath = new HashMap<>();
    escapePath.put('/', '-');
    escapePath.put('\r', '+');
    escapePath.put('\n', '$');
    escapePath.put('\t', '\u00A7'); // section sign
    escapePath.put(' ', '.');

    escapeNewLine = new HashMap<>();
    escapeNewLine.put('\r', 'r');
    escapeNewLine.put('\n', 'n');
}
/** Domain suffixes, written without a leading dot. NOTE(review): used outside the visible part of the file. */
private static final String[] topLevelDomains = {"com", "co.uk", "net", "org", "nl", "be", "me", "nu"};
/**
 * Regular expressions (as strings) matching text in which a run of digits is followed, directly
 * or after a single dot, by at least three ASCII letters.
 * NOTE(review): the name suggests such text is always treated as an error - confirm at the call site.
 */
private static final List<String> certainErrorRegex = Arrays.asList(".*\\d+[A-Za-z]{3,}.*", ".*\\d+\\.[A-Za-z]{3,}.*");
/**
 * Matches one or more groups, each consisting of one upper-case ASCII letter followed by any
 * number of lower-case ASCII letters, optionally surrounded by non-letter characters.
 */
private static Pattern namePattern = Pattern.compile("([^a-zA-Z]*[A-Z][a-z]*[^a-zA-Z]*)+");
/**
 * Rewrites, in place, every character of {@code extracted} that appears in {@code quotes}
 * to a plain apostrophe. All other characters are left untouched.
 *
 * @param extracted the characters to normalize; modified directly
 */
public static void normalizeQuotationMarks(char[] extracted) {
    for (int index = 0; index < extracted.length; index++) {
        final char current = extracted[index];
        for (char quote : quotes) {
            if (quote == current) {
                extracted[index] = '\'';
                // One match is enough; no need to compare against the remaining quote characters.
                break;
            }
        }
    }
}
/**
 * Tells whether {@code c} is one of the lower-case letters a, e, i, o, u or y.
 * Upper-case letters and every other character yield {@code false}.
 *
 * @param c the character to test
 * @return {@code true} if {@code c} is one of the six lower-case vowels listed above
 */
public static boolean isASCIIVowel(char c) {
    return "aeiouy".indexOf(c) >= 0;
}
/**
 * Tells whether {@code c} is a lower-case ASCII letter other than a, e, i, o, u and y.
 * Upper-case letters and every other character yield {@code false}.
 *
 * @param c the character to test
 * @return {@code true} if {@code c} is one of the twenty lower-case consonants
 */
public static boolean isASCIIConsonant(char c) {
    switch (c) {
        case 'b': case 'c': case 'd': case 'f': case 'g':
        case 'h': case 'j': case 'k': case 'l': case 'm':
        case 'n': case 'p': case 'q': case 'r': case 's':
        case 't': case 'v': case 'w': case 'x': case 'z':
            return true;
        default:
            return false;
    }
}
/** Matches runs of characters from the Unicode "Combining Diacritical Marks" block; compiled once and reused. */
private static final Pattern COMBINING_DIACRITICAL_MARKS = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

/**
 * Removes accents from {@code str} by decomposing it to Unicode NFD form and deleting every
 * combining diacritical mark.
 *
 * <p>The letter o with stroke (U+00F8 / U+00D8) has no canonical decomposition, so it is
 * mapped to a plain {@code o} / {@code O} explicitly before normalizing. Other letters without
 * a canonical decomposition are returned unchanged. The result stays in decomposed (NFD) form.
 *
 * @param str the text to strip; may be {@code null}
 * @return {@code str} without accents, or {@code null} when {@code str} is {@code null}
 */
public static String removeAccentsSlow(String str) {
    if (str == null) {
        return null;
    }
    // Literal character replacement; no regular expression is needed for single characters.
    String withoutStrokedO = str.replace('\u00F8', 'o').replace('\u00D8', 'O');
    String decomposed = Normalizer.normalize(withoutStrokedO, Normalizer.Form.NFD);
    return COMBINING_DIACRITICAL_MARKS.matcher(decomposed).replaceAll("");
}
public static void removeHTML(MappedText mappedText) {
StringUtils.replaceScripts(mappedText);
StringUtils.replaceAll("