apoc.text.Strings Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of apoc-core Show documentation
Show all versions of apoc-core Show documentation
Core package for Neo4j Procedures
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package apoc.text;
import static apoc.util.Util.quote;
import static java.lang.Math.toIntExact;
import static java.util.Arrays.asList;
import apoc.util.Util;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.text.similarity.HammingDistance;
import org.apache.commons.text.similarity.JaroWinklerDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.graphdb.Transaction;
import org.neo4j.procedure.Context;
import org.neo4j.procedure.Description;
import org.neo4j.procedure.Name;
import org.neo4j.procedure.UserFunction;
/**
* @author mh
* @since 05.05.16
*/
public class Strings {
// Reusable Apache Commons Text algorithm instances; each one backs the user function of the same name below.
private static final HammingDistance hammingDistance = new HammingDistance();
private static final JaroWinklerDistance jaroWinklerDistance = new JaroWinklerDistance();
private static final LevenshteinDistance levenshteinDistance = new LevenshteinDistance();
// Transaction handle marked with @Context (presumably injected by Neo4j for each invocation -- confirm).
// NOTE(review): none of the functions visible in this part of the file read it.
@Context
public Transaction tx;
/**
 * Finds the first occurrence of {@code lookup} in {@code text}, restricted to the window
 * {@code [from, to)}. A match only counts when it lies completely inside the window.
 *
 * @param text the string to search; {@code null} yields {@code null}
 * @param lookup the substring to look for; {@code null} yields {@code -1}
 * @param from index at which the search starts; values outside the string are clamped to it
 * @param to exclusive end of the window; {@code -1} or any value beyond the end of the string
 *     means "search to the end"; any other negative value describes an empty window
 * @return the index of the first match, or {@code -1} when there is none
 */
@UserFunction("apoc.text.indexOf")
@Description("Returns the first occurrence of the lookup `STRING` in the given `STRING`, or -1 if not found.")
public Long indexOf(
        final @Name("text") String text,
        final @Name("lookup") String lookup,
        final @Name(value = "from", defaultValue = "0") long from,
        @Name(value = "to", defaultValue = "-1") long to) {
    if (text == null) return null;
    if (lookup == null) return -1L;
    final int length = text.length();
    // Clamp instead of casting: a plain (int) cast silently wraps longs beyond the int range.
    final int start = (int) Math.max(0L, Math.min(from, (long) length));
    if (to == -1L || to > length) return (long) text.indexOf(lookup, start);
    // Empty window, or a negative bound that previously reached substring() and threw.
    if (to <= from || to < 0L) return -1L;
    // The first match at or after start is the only candidate: if it overruns the window,
    // every later match overruns it too, so no substring copy is needed.
    final int first = text.indexOf(lookup, start);
    return (first != -1 && first + lookup.length() <= to) ? (long) first : -1L;
}
/**
 * Collects the start index of every occurrence of {@code lookup} in {@code text} whose start
 * lies in the window {@code [from, to)}. Overlapping occurrences are all reported.
 *
 * @param text the string to search; {@code null} yields {@code null}
 * @param lookup the substring to look for; {@code null} yields an empty list
 * @param from index at which the search starts; values outside the string are clamped to it
 * @param to exclusive upper bound for match start positions; {@code -1} or any value beyond
 *     the end of the string means "up to the end"
 * @return the match positions in ascending order, possibly empty
 */
@UserFunction("apoc.text.indexesOf")
@Description("Returns all occurrences of the lookup `STRING` in the given `STRING`, or an empty list if not found.")
public List<Long> indexesOf(
        final @Name("text") String text,
        final @Name("lookup") String lookup,
        final @Name(value = "from", defaultValue = "0") long from,
        @Name(value = "to", defaultValue = "-1") long to) {
    if (text == null) return null;
    if (lookup == null) return Collections.emptyList();
    final int length = text.length();
    // Cap the bound at the string length: with an empty lookup, indexOf keeps answering
    // text.length() forever, so an uncapped bound made the loop spin until memory ran out.
    final long end = (to == -1L || to > length) ? length : to;
    // Clamp instead of casting so longs beyond the int range do not wrap around.
    final int start = (int) Math.max(0L, Math.min(from, (long) length));
    final List<Long> result = new ArrayList<>();
    for (int idx = text.indexOf(lookup, start); idx != -1 && idx < end; idx = text.indexOf(lookup, idx + 1)) {
        result.add((long) idx);
    }
    return result;
}
/**
 * Alias of {@code regreplace}: substitutes every match of {@code regex} in {@code text}
 * with {@code replacement}.
 *
 * @param text the input string
 * @param regex the regular expression to match
 * @param replacement the replacement applied to each match
 * @return whatever {@code regreplace} returns for the same arguments
 */
@UserFunction("apoc.text.replace")
@Description("Finds and replaces all matches found by the given regular expression with the given replacement.")
public String replace(
        final @Name("text") String text,
        final @Name("regex") String regex,
        final @Name("replacement") String replacement) {
    // No logic of its own: null handling and the regex work both live in regreplace.
    final String replaced = regreplace(text, regex, replacement);
    return replaced;
}
/**
 * Measures how many bytes {@code text} occupies once encoded with {@code charset}.
 *
 * @param text the string to measure
 * @param charset name of the character set used for encoding
 * @return the length of the encoded byte sequence
 * @throws UnsupportedEncodingException if the named charset is not supported
 */
@UserFunction("apoc.text.byteCount")
@Description("Returns the size of the given `STRING` in bytes.")
public long byteCount(
        final @Name("text") String text, @Name(value = "charset", defaultValue = "UTF-8") String charset)
        throws UnsupportedEncodingException {
    final byte[] encoded = text.getBytes(charset);
    return encoded.length;
}
/**
 * Encodes {@code text} with {@code charset} and returns the bytes as unsigned values (0-255).
 *
 * @param text the string to encode; {@code null} yields {@code null}
 * @param charset name of the character set used for encoding
 * @return one list entry per encoded byte, in order
 * @throws UnsupportedEncodingException if the named charset is not supported
 */
@UserFunction("apoc.text.bytes")
@Description("Returns the given `STRING` as bytes.")
public List<Long> bytes(
        final @Name("text") String text, @Name(value = "charset", defaultValue = "UTF-8") String charset)
        throws UnsupportedEncodingException {
    // Propagate null like the other text functions instead of failing with a NullPointerException.
    if (text == null) return null;
    final byte[] encoded = text.getBytes(charset);
    final List<Long> result = new ArrayList<>(encoded.length);
    for (final byte b : encoded) {
        // Java bytes are signed; masking maps them onto 0..255.
        result.add(b & 0xFFL);
    }
    return result;
}
/**
 * Substitutes every match of {@code regex} in {@code text} with {@code replacement}.
 * The replacement uses regular-expression replacement syntax, so group references such as
 * {@code $1} are expanded.
 *
 * @param text the input string
 * @param regex the regular expression to match
 * @param replacement the replacement applied to each match
 * @return the rewritten string, or {@code null} if any argument is {@code null}
 */
@UserFunction("apoc.text.regreplace")
@Description("Finds and replaces all matches found by the given regular expression with the given replacement.")
public String regreplace(
        final @Name("text") String text,
        final @Name("regex") String regex,
        final @Name("replacement") String replacement) {
    final boolean allPresent = text != null && regex != null && replacement != null;
    if (!allPresent) {
        return null;
    }
    final Matcher matcher = Pattern.compile(regex).matcher(text);
    return matcher.replaceAll(replacement);
}
/**
 * Splits {@code text} around matches of {@code regex}.
 *
 * @param text the string to split
 * @param regex the separator, as a regular expression
 * @param limit passed to {@link String#split(String, int)} after narrowing to {@code int}:
 *     a positive value caps the number of parts, {@code 0} drops trailing empty strings,
 *     and a negative value keeps them
 * @return a new mutable list of the parts, or {@code null} if any argument is {@code null}
 */
@UserFunction("apoc.text.split")
@Description("Splits the given `STRING` using a given regular expression as a separator.")
public List<String> split(
        final @Name("text") String text,
        final @Name("regex") String regex,
        final @Name(value = "limit", defaultValue = "0") Long limit) {
    if (text == null || regex == null || limit == null) {
        return null;
    }
    final String[] parts = text.split(regex, limit.intValue());
    // Copy so the caller gets a resizable list rather than a fixed-size view of the array.
    return new ArrayList<>(Arrays.asList(parts));
}
@UserFunction("apoc.text.regexGroups")
@Description("Returns all groups matching the given regular expression in the given text.")
public List> regexGroups(final @Name("text") String text, final @Name("regex") String regex) {
if (text == null || regex == null) {
return Collections.EMPTY_LIST;
} else {
final Pattern pattern = Pattern.compile(regex);
final Matcher matcher = pattern.matcher(text);
List> result = new ArrayList<>();
while (matcher.find()) {
List matchResult = new ArrayList<>();
for (int i = 0; i <= matcher.groupCount(); i++) {
matchResult.add(matcher.group(i));
}
result.add(matchResult);
}
return result;
}
}
/**
 * Concatenates {@code texts}, placing {@code delimiter} between consecutive elements.
 *
 * @param texts the strings to concatenate
 * @param delimiter the separator inserted between elements
 * @return the joined string, or {@code null} if either argument is {@code null}
 */
@UserFunction("apoc.text.join")
@Description("Joins the given `STRING` values using the given delimiter.")
public String join(final @Name("texts") List<String> texts, final @Name("delimiter") String delimiter) {
    if (texts == null || delimiter == null) {
        return null;
    }
    return String.join(delimiter, texts);
}
/**
 * Normalises {@code text} by passing it through {@code removeNonWordCharacters}.
 *
 * @param text the string to normalise
 * @return the normalised string, or {@code null} if {@code text} is {@code null}
 */
@UserFunction("apoc.text.clean")
@Description(
        "Strips the given `STRING` of everything except alpha numeric characters and converts it to lower case.")
public String clean(final @Name("text") String text) {
    if (text == null) {
        return null;
    }
    return removeNonWordCharacters(text);
}
/**
 * Tests whether two strings are equal after both have been passed through
 * {@code removeNonWordCharacters}.
 *
 * @param text1 the first string
 * @param text2 the second string
 * @return {@code true} if the normalised forms are equal; {@code false} otherwise, including
 *     when either argument is {@code null}
 */
@UserFunction("apoc.text.compareCleaned")
@Description(
        "Compares two given `STRING` values stripped of everything except alpha numeric characters converted to lower case.")
public boolean compareCleaned(final @Name("text1") String text1, final @Name("text2") String text2) {
    final boolean bothPresent = text1 != null && text2 != null;
    if (!bothPresent) {
        return false;
    }
    final String left = removeNonWordCharacters(text1);
    final String right = removeNonWordCharacters(text2);
    return left.equals(right);
}
/**
 * Alias of {@code levenshteinDistance}.
 *
 * @param text1 the first string
 * @param text2 the second string
 * @return whatever {@code levenshteinDistance} returns for the same arguments
 */
@UserFunction("apoc.text.distance")
@Description("Compares the two given `STRING` values using the Levenshtein distance algorithm.")
public Long distance(final @Name("text1") String text1, @Name("text2") final String text2) {
    // Kept as a separate entry point; the computation itself lives in levenshteinDistance.
    final Long edits = levenshteinDistance(text1, text2);
    return edits;
}
/**
 * Computes the Levenshtein distance between two strings using the shared
 * {@code LevenshteinDistance} instance.
 *
 * @param text1 the first string
 * @param text2 the second string
 * @return the distance, or {@code null} if either argument is {@code null}
 */
@UserFunction("apoc.text.levenshteinDistance")
@Description("Compares the given `STRING` values using the Levenshtein distance algorithm.")
public Long levenshteinDistance(final @Name("text1") String text1, @Name("text2") final String text2) {
    final boolean bothPresent = text1 != null && text2 != null;
    if (!bothPresent) {
        return null;
    }
    final int edits = levenshteinDistance.apply(text1, text2);
    return (long) edits;
}
/**
 * Converts the Levenshtein distance of two strings into a similarity score: the share of the
 * longer string's length that is not accounted for by the distance.
 *
 * @param text1 the first string
 * @param text2 the second string
 * @return {@code 1.0} when both strings are empty, otherwise
 *     {@code (maxLength - distance) / maxLength}; {@code null} if either argument is {@code null}
 */
@UserFunction("apoc.text.levenshteinSimilarity")
@Description(
        "Returns the similarity (a value within 0 and 1) between the two given `STRING` values based on the Levenshtein distance algorithm.")
public Double levenshteinSimilarity(final @Name("text1") String text1, @Name("text2") final String text2) {
    final boolean bothPresent = text1 != null && text2 != null;
    if (!bothPresent) {
        return null;
    }
    final int maxLength = Math.max(text1.length(), text2.length());
    // Two empty strings are identical; returning early also avoids dividing by zero.
    if (maxLength == 0) {
        return 1.0;
    }
    final long edits = distance(text1, text2);
    final long unchanged = maxLength - edits;
    return unchanged / (double) maxLength;
}
/**
 * Computes the Hamming distance between two strings using the shared {@code HammingDistance}
 * instance.
 *
 * @param text1 the first string
 * @param text2 the second string
 * @return the distance, or {@code null} if either argument is {@code null}
 */
@UserFunction("apoc.text.hammingDistance")
@Description("Compares the two given `STRING` values using the Hamming distance algorithm.")
public Long hammingDistance(final @Name("text1") String text1, @Name("text2") final String text2) {
    final boolean bothPresent = text1 != null && text2 != null;
    if (!bothPresent) {
        return null;
    }
    final int differing = hammingDistance.apply(text1, text2);
    return (long) differing;
}
/**
 * Applies the shared {@code JaroWinklerDistance} instance to two strings.
 *
 * @param text1 the first string
 * @param text2 the second string
 * @return the value computed by the algorithm, or {@code null} if either argument is
 *     {@code null}
 */
@UserFunction("apoc.text.jaroWinklerDistance")
@Description("Compares the two given `STRING` values using the Jaro-Winkler distance algorithm.")
public Double jaroWinklerDistance(final @Name("text1") String text1, @Name("text2") final String text2) {
    final boolean bothPresent = text1 != null && text2 != null;
    if (!bothPresent) {
        return null;
    }
    final Double score = jaroWinklerDistance.apply(text1, text2);
    return score;
}
/**
 * Delegates to {@code SorensenDiceCoefficient.compute} to compare two strings.
 *
 * @param text1 the first string
 * @param text2 the second string
 * @param languageTag language tag forwarded unchanged to the coefficient computation
 * @return the computed coefficient, or {@code null} if any argument is {@code null}
 */
@UserFunction("apoc.text.sorensenDiceSimilarity")
@Description(
        "Compares the two given `STRING` values using the Sørensen–Dice coefficient formula, with the provided IETF language tag.")
public Double sorensenDiceSimilarity(
        final @Name("text1") String text1,
        final @Name("text2") String text2,
        final @Name(value = "languageTag", defaultValue = "en") String languageTag) {
    final boolean allPresent = text1 != null && text2 != null && languageTag != null;
    if (!allPresent) {
        return null;
    }
    return SorensenDiceCoefficient.compute(text1, text2, languageTag);
}
/**
 * Decides whether {@code text2} is "close enough" to {@code text1}. The tolerated edit
 * distance grows with the length of {@code text1}: 0 for fewer than 3 characters, 1 for fewer
 * than 5, and 2 otherwise.
 *
 * @param text1 the reference string whose length sets the tolerance
 * @param text2 the string compared against it
 * @return {@code true} if the distance is within the tolerance; {@code null} if either
 *     argument is {@code null}
 */
@UserFunction("apoc.text.fuzzyMatch")
@Description("Performs a fuzzy match search of the two given `STRING` values.")
public Boolean fuzzyMatch(final @Name("text1") String text1, @Name("text2") final String text2) {
    final boolean bothPresent = text1 != null && text2 != null;
    if (!bothPresent) {
        return null;
    }
    final int referenceLength = text1.length();
    final int tolerance;
    if (referenceLength < 3) {
        tolerance = 0;
    } else if (referenceLength < 5) {
        tolerance = 1;
    } else {
        tolerance = 2;
    }
    final Long edits = distance(text1, text2);
    return edits <= tolerance;
}
/**
 * Encodes {@code text} with {@link URLEncoder} using UTF-8 (HTML form encoding, so a space
 * becomes {@code +}).
 *
 * @param text the string to encode
 * @return the encoded string, or {@code null} if {@code text} is {@code null}
 * @throws RuntimeException if the runtime reports UTF-8 as unsupported
 */
@UserFunction("apoc.text.urlencode")
@Description("Encodes the given URL `STRING`.")
public String urlencode(@Name("text") String text) {
    // Propagate null like the other text functions instead of failing with a NullPointerException.
    if (text == null) {
        return null;
    }
    try {
        return URLEncoder.encode(text, "UTF-8");
    } catch (UnsupportedEncodingException e) {
        throw new RuntimeException("urlencoding failed", e);
    }
}
/**
 * Decodes a form-encoded string with {@link URLDecoder} using UTF-8.
 *
 * @param text the encoded string
 * @return the decoded string, or {@code null} if {@code text} is {@code null}
 * @throws RuntimeException if the runtime reports UTF-8 as unsupported
 */
@UserFunction("apoc.text.urldecode")
@Description("Decodes the given URL encoded `STRING`.")
public String urldecode(@Name("text") String text) {
    // Propagate null like the other text functions instead of failing with a NullPointerException.
    if (text == null) {
        return null;
    }
    try {
        return URLDecoder.decode(text, "UTF-8");
    } catch (UnsupportedEncodingException e) {
        throw new RuntimeException("urldecoding failed", e);
    }
}
// Matches runs of characters that are neither letters nor digits.
private static final Pattern cleanPattern = Pattern.compile("[^\\p{L}\\p{N}]+");
// Matches runs of Unicode mark characters, i.e. the accents left behind by NFD decomposition.
private static final Pattern specialCharPattern = Pattern.compile("\\p{IsM}+");
// German transliterations applied before accent stripping, so that e.g. "ä" becomes "ae" not "a".
private static final String[][] UMLAUT_REPLACEMENTS = {
    {"Ä", "Ae"},
    {"Ü", "Ue"},
    {"Ö", "Oe"},
    {"ä", "ae"},
    {"ü", "ue"},
    {"ö", "oe"},
    {"ß", "ss"}
};

/**
 * Reduces a string to lower-case letters and digits: umlauts and sharp s are transliterated,
 * remaining accents are stripped, and every other non-alphanumeric character is removed.
 *
 * @param s the string to normalise; must not be {@code null}
 * @return the normalised string
 */
private static String removeNonWordCharacters(String s) {
    String result = s;
    for (final String[] replacement : UMLAUT_REPLACEMENTS) {
        result = result.replace(replacement[0], replacement[1]);
    }
    // NFD splits accented letters into base letter + combining mark, so the marks can be dropped.
    result = Normalizer.normalize(result, Normalizer.Form.NFD);
    final String withoutMarks = specialCharPattern.matcher(result).replaceAll("");
    // Locale.ROOT keeps the result independent of the JVM default locale
    // (under a Turkish locale, "I" would otherwise lower-case to a dotless i).
    return cleanPattern.matcher(withoutMarks).replaceAll("").toLowerCase(Locale.ROOT);
}
/**
 * Pads {@code text} on the left until it is {@code count} characters long.
 *
 * @param text the string to pad; {@code null} yields {@code null}
 * @param count the desired total length; if {@code text} is already that long it is returned
 *     unchanged
 * @param delim the padding source; only its first character is used, and a {@code null} or
 *     empty value leaves {@code text} unchanged
 * @return the padded string
 * @throws ArithmeticException if padding is needed and {@code count} exceeds the {@code int} range
 */
@UserFunction("apoc.text.lpad")
@Description("Left pads the given `STRING` by the given width.")
public String lpad(
        @Name("text") String text,
        @Name("count") long count,
        @Name(value = "delimiter", defaultValue = " ") String delim) {
    if (text == null) return null;
    final int len = text.length();
    if (len >= count) return text;
    // Nothing to pad with: returning the text beats crashing on charAt(0).
    if (delim == null || delim.isEmpty()) return text;
    // toIntExact fails loudly instead of letting an (int) cast wrap to a bogus array size.
    final char[] padding = new char[Math.toIntExact(count) - len];
    Arrays.fill(padding, delim.charAt(0));
    return new StringBuilder(padding.length + len)
            .append(padding)
            .append(text)
            .toString();
}
/**
 * Pads {@code text} on the right until it is {@code count} characters long.
 *
 * @param text the string to pad; {@code null} yields {@code null}
 * @param count the desired total length; if {@code text} is already that long it is returned
 *     unchanged
 * @param delim the padding source; only its first character is used, and a {@code null} or
 *     empty value leaves {@code text} unchanged
 * @return the padded string
 * @throws ArithmeticException if padding is needed and {@code count} exceeds the {@code int} range
 */
@UserFunction("apoc.text.rpad")
@Description("Right pads the given `STRING` by the given width.")
public String rpad(
        @Name("text") String text,
        @Name("count") long count,
        @Name(value = "delimiter", defaultValue = " ") String delim) {
    if (text == null) return null;
    final int len = text.length();
    if (len >= count) return text;
    // Nothing to pad with: returning the text beats crashing on charAt(0).
    if (delim == null || delim.isEmpty()) return text;
    // toIntExact fails loudly instead of letting an (int) cast wrap to a bogus array size.
    final char[] padding = new char[Math.toIntExact(count) - len];
    Arrays.fill(padding, delim.charAt(0));
    return new StringBuilder(len + padding.length)
            .append(text)
            .append(padding)
            .toString();
}
@UserFunction("apoc.text.format")
@Description("Formats the given `STRING` with the given parameters.")
public String format(
@Name("text") String text,
@Name("params") List