// org.elasticsearch.hadoop.util.StringUtils (source listing: Maven / Gradle / Ivy)
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.hadoop.util;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.URIUtil;
import org.codehaus.jackson.io.JsonStringEncoder;
import org.elasticsearch.hadoop.EsHadoopIllegalArgumentException;
import org.elasticsearch.hadoop.serialization.json.BackportedJsonStringEncoder;
/**
* Utility methods for working with Strings. Kept in-house to avoid depending on other libraries that might not be available at runtime.
*/
public abstract class StringUtils {
/** Charset used by {@code asUTFString} and {@code toUTF} for byte/String conversion. */
public static final Charset UTF_8 = Charset.forName("UTF-8");
/** The empty string; returned by several helpers in this class for null or empty input. */
public static final String EMPTY = "";
/** Shared zero-length String array. */
public static final String[] EMPTY_ARRAY = new String[0];
/** Default delimiter (a comma) - the same value {@code tokenize(String)} splits on. */
public static final String DEFAULT_DELIMITER = ",";
// Evaluated once at class initialization; jsonEncoding() reads it to choose between the Jackson
// encoder and the backported one. Presumably true when the named Jackson class is loadable through
// this class' loader - confirm against ObjectUtils.isClassPresent.
private static final boolean HAS_JACKSON_CLASS = ObjectUtils.isClassPresent("org.codehaus.jackson.io.JsonStringEncoder", StringUtils.class.getClassLoader());
/**
 * Tells whether the given sequence holds at least one character.
 *
 * @param sequence the sequence to inspect; may be {@code null}
 * @return {@code true} if the sequence is non-null and not empty, {@code false} otherwise
 */
public static boolean hasLength(CharSequence sequence) {
    if (sequence == null) {
        return false;
    }
    return sequence.length() > 0;
}
/**
 * Tells whether the given sequence contains at least one non-whitespace character
 * (as defined by {@link Character#isWhitespace(char)}).
 *
 * @param sequence the sequence to inspect; may be {@code null}
 * @return {@code true} if a non-whitespace character is found; {@code false} for null,
 *         empty or whitespace-only input
 */
public static boolean hasText(CharSequence sequence) {
    if (hasLength(sequence)) {
        for (int pos = 0, end = sequence.length(); pos < end; pos++) {
            char current = sequence.charAt(pos);
            if (!Character.isWhitespace(current)) {
                return true;
            }
        }
    }
    return false;
}
/**
 * Counts the non-overlapping occurrences of {@code substring} inside {@code string},
 * scanning left to right.
 *
 * @param string    the text to search; may be {@code null}
 * @param substring the text to look for; may be {@code null}
 * @return the number of matches, or 0 if either argument is null or empty
 */
public static int countOccurrences(String string, String substring) {
    if (string == null || substring == null) {
        return 0;
    }
    if (string.length() == 0 || substring.length() == 0) {
        return 0;
    }

    final int step = substring.length();
    int hits = 0;
    // after each match, resume the search right behind it so matches never overlap
    for (int at = string.indexOf(substring, 0); at != -1; at = string.indexOf(substring, at + step)) {
        hits++;
    }
    return hits;
}
/**
 * Splits the given string on the default delimiter (a comma), trimming each token and
 * dropping the ones that end up empty.
 *
 * @param string the text to split; may be {@code null}
 * @return the resulting tokens; an empty list for {@code null} input
 */
public static List<String> tokenize(String string) {
    return tokenize(string, DEFAULT_DELIMITER);
}
/**
 * Splits the given string on any of the given delimiter characters, trimming each token
 * and dropping the ones that end up empty.
 *
 * @param string     the text to split; may be {@code null}
 * @param delimiters the delimiter characters
 * @return the resulting tokens; an empty list for {@code null} input
 */
public static List<String> tokenize(String string, String delimiters) {
    return tokenize(string, delimiters, true, true);
}
/**
 * Splits the given string like {@code tokenize(string, delimiters)} (tokens trimmed, empty
 * ones dropped) and runs every token through {@code decodeQuery}.
 *
 * @param string     the text to split; may be {@code null}
 * @param delimiters the delimiter characters
 * @return the decoded tokens, in their original order; empty for {@code null} input
 */
public static List<String> tokenizeAndUriDecode(String string, String delimiters) {
    List<String> tokens = tokenize(string, delimiters, true, true);
    List<String> decoded = new ArrayList<String>(tokens.size());
    for (String token : tokens) {
        decoded.add(decodeQuery(token));
    }
    return decoded;
}
/**
 * Splits the given string using a {@link StringTokenizer}, so every character of
 * {@code delimiters} acts as a separator.
 *
 * @param string            the text to split; may be {@code null}
 * @param delimiters        the delimiter characters
 * @param trimTokens        whether to {@link String#trim() trim} each token
 * @param ignoreEmptyTokens whether to drop tokens of length zero (in practice, tokens that
 *                          consisted only of whitespace before trimming)
 * @return a mutable list of tokens in encounter order, or an immutable empty list when
 *         {@code string} is {@code null}
 */
public static List<String> tokenize(String string, String delimiters, boolean trimTokens, boolean ignoreEmptyTokens) {
    if (string == null) {
        return Collections.emptyList();
    }
    StringTokenizer st = new StringTokenizer(string, delimiters);
    List<String> tokens = new ArrayList<String>();
    while (st.hasMoreTokens()) {
        String token = st.nextToken();
        if (trimTokens) {
            token = token.trim();
        }
        if (!ignoreEmptyTokens || token.length() > 0) {
            tokens.add(token);
        }
    }
    return tokens;
}
public static String concatenate(Collection> list, String delimiter) {
if (list == null || list.isEmpty()) {
return EMPTY;
}
if (delimiter == null) {
delimiter = EMPTY;
}
StringBuilder sb = new StringBuilder();
for (Object object : list) {
sb.append(object.toString());
sb.append(delimiter);
}
sb.setLength(sb.length() - delimiter.length());
return sb.toString();
}
public static String concatenateAndUriEncode(Collection> list, String delimiter) {
Collection escaped = new ArrayList();
if (list != null) {
for (Object object : list) {
escaped.add(encodeQuery(object.toString()));
}
}
return concatenate(escaped, delimiter);
}
/**
 * Joins the elements of the array, separated by the delimiter. Elements are appended via
 * {@link StringBuilder#append(Object)}, so a {@code null} element shows up as {@code "null"}.
 *
 * @param array     the elements to join; may be {@code null}
 * @param delimiter the separator placed between elements; {@code null} is treated as empty
 * @return the joined string, or the empty string for a null or empty array
 */
public static String concatenate(Object[] array, String delimiter) {
    if (array == null || array.length == 0) {
        return EMPTY;
    }
    final String separator = (delimiter != null ? delimiter : EMPTY);

    StringBuilder joined = new StringBuilder();
    // the first element carries no leading separator; every following one does
    joined.append(array[0]);
    for (int i = 1; i < array.length; i++) {
        joined.append(separator);
        joined.append(array[i]);
    }
    return joined.toString();
}
/**
 * Removes every whitespace character (per {@link Character#isWhitespace(char)}) from the
 * given sequence.
 *
 * @param sequence the text to strip; may be {@code null}
 * @return the text without whitespace; the empty string for null or empty input
 */
public static String deleteWhitespace(CharSequence sequence) {
    if (!hasLength(sequence)) {
        return EMPTY;
    }
    StringBuilder kept = new StringBuilder(sequence.length());
    int pos = 0;
    while (pos < sequence.length()) {
        char candidate = sequence.charAt(pos++);
        if (Character.isWhitespace(candidate)) {
            continue;
        }
        kept.append(candidate);
    }
    // nothing was dropped - hand back the string form of the input itself
    if (kept.length() == sequence.length()) {
        return sequence.toString();
    }
    return kept.toString();
}
/**
 * Decodes the whole byte array as UTF-8.
 *
 * @param content the bytes to decode; may be {@code null}
 * @return the decoded string, or the empty string for a null or empty array
 */
public static String asUTFString(byte[] content) {
    // guard here: reading content.length would throw before the range overload's own null check runs
    return (content == null ? EMPTY : asUTFString(content, 0, content.length));
}
/**
 * Decodes a slice of the byte array as UTF-8.
 *
 * @param content the bytes to decode; may be {@code null}
 * @param offset  index of the first byte to decode
 * @param length  number of bytes to decode
 * @return the decoded string, or the empty string if {@code content} is null or
 *         {@code length} is 0
 */
public static String asUTFString(byte[] content, int offset, int length) {
    if (content == null || length == 0) {
        return EMPTY;
    }
    return new String(content, offset, length, UTF_8);
}
/**
 * Encodes the given string as UTF-8 bytes.
 *
 * @param string the text to encode; must not be {@code null}
 * @return the UTF-8 encoded bytes
 */
public static byte[] toUTF(String string) {
    final byte[] encoded = string.getBytes(UTF_8);
    return encoded;
}
/**
 * Computes the Levenshtein (edit) distance between two sequences, but only within a
 * given threshold. Based on "Algorithms on Strings, Trees and Sequences" by Dan Gusfield.
 * Only a stripe of width {@code 2 * threshold + 1} around the diagonal of the cost table is
 * evaluated, using two rows of storage.
 *
 * @param one       the first sequence; must not be {@code null}
 * @param another   the second sequence; must not be {@code null}
 * @param threshold the maximum distance of interest
 * @return the edit distance if it is less than or equal to {@code threshold}, -1 otherwise
 *         (including for any negative threshold)
 */
public static int levenshteinDistance(CharSequence one, CharSequence another, int threshold) {
    int n = one.length();
    int m = another.length();

    // no distance can be within a negative threshold; bail out before the table set-up,
    // which would otherwise hand Arrays.fill a negative start index
    if (threshold < 0) {
        return -1;
    }

    // if one string is empty, the edit distance is necessarily the length of the other
    if (n == 0) {
        return m <= threshold ? m : -1;
    }
    else if (m == 0) {
        return n <= threshold ? n : -1;
    }

    if (n > m) {
        // swap the two strings to consume less memory
        final CharSequence tmp = one;
        one = another;
        another = tmp;
        n = m;
        m = another.length();
    }

    int[] p = new int[n + 1]; // 'previous' cost array, horizontally
    int[] d = new int[n + 1]; // cost array, horizontally
    int[] swap; // placeholder to assist in swapping p and d

    // fill in starting table values
    final int boundary = Math.min(n, threshold) + 1;
    for (int i = 0; i < boundary; i++) {
        p[i] = i;
    }
    // these fills ensure that the value above the rightmost entry of our
    // stripe will be ignored in following loop iterations
    Arrays.fill(p, boundary, p.length, Integer.MAX_VALUE);
    Arrays.fill(d, Integer.MAX_VALUE);

    for (int j = 1; j <= m; j++) {
        final char current = another.charAt(j - 1);
        d[0] = j;

        // compute stripe indices, constrain to array size
        final int min = Math.max(1, j - threshold);
        // the first operand protects j + threshold against integer overflow
        final int max = (j > Integer.MAX_VALUE - threshold) ? n : Math.min(n, j + threshold);

        // the stripe may lead off of the table if the sequences are of different sizes
        if (min > max) {
            return -1;
        }

        // ignore entry left of leftmost
        if (min > 1) {
            d[min - 1] = Integer.MAX_VALUE;
        }

        // iterates through [min, max] in the (shorter) first sequence
        for (int i = min; i <= max; i++) {
            if (one.charAt(i - 1) == current) {
                // diagonally left and up
                d[i] = p[i - 1];
            }
            else {
                // 1 + minimum of cell to the left, to the top, diagonally left and up
                d[i] = 1 + Math.min(Math.min(d[i - 1], p[i]), p[i - 1]);
            }
        }

        // copy current distance counts to 'previous row' distance counts
        swap = p;
        p = d;
        d = swap;
    }

    // if p[n] is greater than the threshold, there's no guarantee on it being the correct
    // distance
    if (p[n] <= threshold) {
        return p[n];
    }
    return -1;
}
/**
 * Picks, out of the given candidates, the ones closest to {@code match} in terms of
 * Levenshtein distance. Only candidates at a distance of at most 2 are considered; whenever
 * a closer candidate is found, the farther ones collected so far are discarded.
 *
 * @param match     the text to compare against
 * @param potential the candidate strings
 * @return the candidates sharing the smallest distance found, in iteration order; empty if
 *         none is within distance 2
 */
public static List<String> findSimiliar(CharSequence match, Collection<String> potential) {
    List<String> list = new ArrayList<String>(3);

    // 1 switches or 1 extra char
    int maxDistance = 2;

    for (String string : potential) {
        // the threshold shrinks as better matches appear, so worse candidates yield -1
        int dist = levenshteinDistance(match, string, maxDistance);
        if (dist >= 0) {
            if (dist < maxDistance) {
                // strictly better match - restart the result list around it
                maxDistance = dist;
                list.clear();
                list.add(string);
            }
            else if (dist == maxDistance) {
                list.add(string);
            }
        }
    }
    return list;
}
/**
 * Trims the given resource and strips at most one leading and one trailing slash.
 *
 * @param resource the resource string; must not be {@code null}
 * @return the resource without surrounding whitespace and without a leading/trailing '/'
 */
public static String sanitizeResource(String resource) {
    final String trimmed = resource.trim();

    int from = 0;
    int to = trimmed.length();
    if (trimmed.startsWith("/")) {
        from = 1;
    }
    // only strip a trailing slash that is still there once the leading one is gone
    if (to > from && trimmed.charAt(to - 1) == '/') {
        to--;
    }
    return trimmed.substring(from, to);
}
/**
 * Escapes the path and query of the given URI through {@code URIUtil.encodePathQuery}.
 *
 * @param uri the URI to escape
 * @return the escaped URI
 * @throws EsHadoopIllegalArgumentException if the URI cannot be escaped; the underlying
 *         {@code URIException} is kept as the cause
 */
public static String encodeUri(String uri) {
    try {
        return URIUtil.encodePathQuery(uri);
    } catch (URIException ex) {
        throw new EsHadoopIllegalArgumentException("Cannot escape uri " + uri, ex);
    }
}
/**
 * Escapes the given path through {@code URIUtil.encodePath}, using UTF-8.
 *
 * @param path the path to escape
 * @return the escaped path
 * @throws EsHadoopIllegalArgumentException if the path cannot be encoded
 */
public static String encodePath(String path) {
    try {
        return URIUtil.encodePath(path, "UTF-8");
    } catch (URIException ex) {
        throw new EsHadoopIllegalArgumentException("Cannot encode path " + path, ex);
    }
}
/**
 * Unescapes the given path through {@code URIUtil.decode}, using UTF-8.
 *
 * @param path the escaped path
 * @return the decoded path
 * @throws EsHadoopIllegalArgumentException if the path cannot be decoded
 */
public static String decodePath(String path) {
    try {
        return URIUtil.decode(path, "UTF-8");
    } catch (URIException ex) {
        throw new EsHadoopIllegalArgumentException("Cannot decode path " + path, ex);
    }
}
/**
 * Encodes the given query value with {@link URLEncoder} (form encoding), using UTF-8.
 *
 * @param query the raw value; must not be {@code null}
 * @return the encoded value
 * @throws EsHadoopIllegalArgumentException if the UTF-8 encoding is reported as unsupported
 */
public static String encodeQuery(String query) {
    try {
        return URLEncoder.encode(query, "UTF-8");
    } catch (UnsupportedEncodingException ex) {
        throw new EsHadoopIllegalArgumentException("Cannot encode query " + query, ex);
    }
}
/**
 * Decodes the given form-encoded query value with {@link URLDecoder}, using UTF-8.
 *
 * @param query the encoded value; must not be {@code null}
 * @return the decoded value
 * @throws EsHadoopIllegalArgumentException if the UTF-8 encoding is reported as unsupported
 */
public static String decodeQuery(String query) {
    try {
        return URLDecoder.decode(query, "UTF-8");
    } catch (UnsupportedEncodingException ex) {
        throw new EsHadoopIllegalArgumentException("Cannot decode query " + query, ex);
    }
}
/**
 * Tells whether the given sequence is free of upper-case characters (per
 * {@link Character#isUpperCase(char)}). Characters without case, such as digits, do not
 * count against it.
 *
 * @param string the text to check; must not be {@code null}
 * @return {@code false} as soon as an upper-case character is found, {@code true} otherwise
 *         (including for an empty sequence)
 */
public static boolean isLowerCase(CharSequence string) {
    int position = 0;
    while (position < string.length()) {
        char current = string.charAt(position);
        if (Character.isUpperCase(current)) {
            return false;
        }
        position++;
    }
    return true;
}
/**
 * Runs the given string through a JSON string encoder: the Jackson one when
 * {@code HAS_JACKSON_CLASS} is set, the backported one otherwise.
 *
 * @param rawString the text to encode
 * @return the encoder output as a String; surrounding quotes are presumably not included,
 *         since {@code toJsonString} adds them itself
 */
public static String jsonEncoding(String rawString) {
    final char[] quoted;
    if (HAS_JACKSON_CLASS) {
        quoted = JacksonStringEncoder.jsonEncoding(rawString);
    }
    else {
        quoted = BackportedJsonStringEncoder.getInstance().quoteAsString(rawString);
    }
    return new String(quoted);
}
/**
 * Renders the given value in a JSON friendly way: {@code null} becomes the literal
 * {@code null}, a String is JSON-encoded and wrapped in double quotes, and anything else is
 * emitted through {@code toString()} as is.
 *
 * @param value the value to render; may be {@code null}
 * @return the JSON representation of the value
 */
public static String toJsonString(Object value) {
    if (value == null) {
        return "null";
    }
    if (String.class.equals(value.getClass())) {
        String encoded = StringUtils.jsonEncoding(value.toString());
        return "\"" + encoded + "\"";
    }
    // anything else is taken to be a Boolean or Number, so no escaping or quotes
    return value.toString();
}
/**
 * Thin holder around Jackson's {@code JsonStringEncoder}. The only caller in this file,
 * {@code StringUtils.jsonEncoding}, goes through it only when {@code HAS_JACKSON_CLASS} is
 * true - presumably so the Jackson class is not touched when it is missing at runtime.
 */
private static class JacksonStringEncoder {
    public static char[] jsonEncoding(String rawString) {
        final JsonStringEncoder encoder = JsonStringEncoder.getInstance();
        return encoder.quoteAsString(rawString);
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy