apoc.text.Strings Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of apoc-core Show documentation
Core package for Neo4j Procedures
There is a newer version: 5.24.0
/*
 * Copyright (c) "Neo4j"
 * Neo4j Sweden AB [http://neo4j.com]
 *
 * This file is part of Neo4j.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package apoc.text;

import static apoc.util.Util.quote;
import static java.lang.Math.toIntExact;
import static java.util.Arrays.asList;

import apoc.util.Util;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.text.similarity.HammingDistance;
import org.apache.commons.text.similarity.JaroWinklerDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.graphdb.Transaction;
import org.neo4j.procedure.Context;
import org.neo4j.procedure.Description;
import org.neo4j.procedure.Name;
import org.neo4j.procedure.UserFunction;

/**
 * @author mh
 * @since 05.05.16
 */
public class Strings {

    // Shared commons-text calculators backing the hammingDistance, jaroWinklerDistance and
    // levenshteinDistance functions of this class.
    private static final HammingDistance hammingDistance = new HammingDistance();
    private static final JaroWinklerDistance jaroWinklerDistance = new JaroWinklerDistance();
    private static final LevenshteinDistance levenshteinDistance = new LevenshteinDistance();

    // Transaction supplied through the @Context annotation.
    // NOTE(review): none of the functions visible in this part of the file read it — confirm usage
    // further down before removing.
    @Context
    public Transaction tx;

    /**
     * Finds the first occurrence of {@code lookup} in {@code text}, optionally restricted to the
     * window between {@code from} (inclusive) and {@code to} (exclusive).
     *
     * <p>When a window end is given, a match only counts if it lies entirely before {@code to}.
     * The search runs directly on {@code text}; no prefix copy is created.
     *
     * @param text the string to search; {@code null} yields {@code null}
     * @param lookup the string to look for; {@code null} yields {@code -1}
     * @param from index to start searching at; negative values behave like {@code 0}
     * @param to exclusive end of the search window; {@code -1} or a value beyond the end of
     *     {@code text} means "search to the end"
     * @return the index of the first match, or {@code -1} if there is none
     */
    @UserFunction("apoc.text.indexOf")
    @Description("Returns the first occurrence of the lookup `STRING` in the given `STRING`, or -1 if not found.")
    public Long indexOf(
            final @Name("text") String text,
            final @Name("lookup") String lookup,
            final @Name(value = "from", defaultValue = "0") long from,
            @Name(value = "to", defaultValue = "-1") long to) {
        if (text == null) return null;
        if (lookup == null) return -1L;

        final boolean unbounded = to == -1L || to > text.length();
        if (!unbounded && to <= from) return -1L;

        // Clamp rather than cast: a plain (int) cast wraps values outside the int range and would
        // silently start the search at an unrelated position.
        final int start = (int) Math.max(0L, Math.min(from, Integer.MAX_VALUE));
        final long idx = text.indexOf(lookup, start);
        if (unbounded || idx == -1L) return idx;

        // The first match at or after `start` is also the first match inside the window, provided
        // it ends before `to`; otherwise no later match can fit either. Comparing as long also
        // keeps a negative `to` (other than -1) from ever being used as a substring bound.
        return idx + lookup.length() <= to ? idx : -1L;
    }

    /**
     * Collects the start index of every occurrence of {@code lookup} in {@code text} whose start
     * lies in the window between {@code from} (inclusive) and {@code to} (exclusive).
     *
     * @param text the string to search; {@code null} yields {@code null}
     * @param lookup the string to look for; {@code null} yields an empty list
     * @param from index to start searching at; negative values behave like {@code 0}
     * @param to exclusive upper bound for match start positions; {@code -1} or a value beyond the
     *     end of {@code text} means "search to the end"
     * @return the match positions in ascending order, possibly empty
     */
    @UserFunction("apoc.text.indexesOf")
    @Description("Returns all occurrences of the lookup `STRING` in the given `STRING`, or an empty list if not found.")
    public List<Long> indexesOf(
            final @Name("text") String text,
            final @Name("lookup") String lookup,
            final @Name(value = "from", defaultValue = "0") long from,
            @Name(value = "to", defaultValue = "-1") long to) {
        if (text == null) return null;
        if (lookup == null) return Collections.emptyList();

        // Clamp the window end to the text length. For an empty lookup, String.indexOf never
        // returns more than text.length(), so an unclamped `to` beyond the end would make the
        // loop add the same index forever.
        final long end = (to == -1L || to > text.length()) ? text.length() : to;
        // Clamp rather than cast so out-of-int-range values cannot wrap around.
        final int start = (int) Math.max(0L, Math.min(from, Integer.MAX_VALUE));

        final List<Long> result = new ArrayList<>();
        for (int idx = text.indexOf(lookup, start);
                idx != -1 && idx < end;
                idx = text.indexOf(lookup, idx + 1)) {
            result.add((long) idx);
        }
        return result;
    }

    /**
     * Replaces every match of {@code regex} in {@code text} with {@code replacement}.
     *
     * <p>This is an alias that forwards to {@code regreplace}, so both functions share one
     * implementation and the same handling of {@code null} arguments.
     *
     * @param text the input string
     * @param regex the regular expression to match
     * @param replacement the replacement applied to every match
     * @return whatever {@code regreplace} returns for the same arguments
     */
    @UserFunction("apoc.text.replace")
    @Description("Finds and replaces all matches found by the given regular expression with the given replacement.")
    public String replace(
            final @Name("text") String text,
            final @Name("regex") String regex,
            final @Name("replacement") String replacement) {
        final String replaced = regreplace(text, regex, replacement);
        return replaced;
    }

    /**
     * Counts the bytes needed to encode {@code text} in the given charset.
     *
     * @param text the string to measure
     * @param charset the name of the charset used for encoding
     * @return the length of the encoded byte sequence
     * @throws UnsupportedEncodingException if the named charset is not supported
     */
    @UserFunction("apoc.text.byteCount")
    @Description("Returns the size of the given `STRING` in bytes.")
    public long byteCount(
            final @Name("text") String text, @Name(value = "charset", defaultValue = "UTF-8") String charset)
            throws UnsupportedEncodingException {
        final byte[] encoded = text.getBytes(charset);
        return encoded.length;
    }

    /**
     * Encodes {@code text} in the given charset and returns the bytes as unsigned values.
     *
     * @param text the string to encode
     * @param charset the name of the charset used for encoding
     * @return one entry per encoded byte, each in the range 0..255
     * @throws UnsupportedEncodingException if the named charset is not supported
     */
    @UserFunction("apoc.text.bytes")
    @Description("Returns the given `STRING` as bytes.")
    public List<Long> bytes(
            final @Name("text") String text, @Name(value = "charset", defaultValue = "UTF-8") String charset)
            throws UnsupportedEncodingException {
        final byte[] encoded = text.getBytes(charset);
        final List<Long> result = new ArrayList<>(encoded.length);
        for (byte b : encoded) {
            // Java bytes are signed; expose them as 0..255 instead of -128..127.
            result.add(Byte.toUnsignedLong(b));
        }
        return result;
    }

    /**
     * Replaces every match of {@code regex} in {@code text} with {@code replacement}.
     *
     * @param text the input string
     * @param regex the regular expression to match
     * @param replacement the replacement applied to every match
     * @return the rewritten string, or {@code null} if any argument is {@code null}
     */
    @UserFunction("apoc.text.regreplace")
    @Description("Finds and replaces all matches found by the given regular expression with the given replacement.")
    public String regreplace(
            final @Name("text") String text,
            final @Name("regex") String regex,
            final @Name("replacement") String replacement) {
        final boolean allPresent = text != null && regex != null && replacement != null;
        if (allPresent) {
            return text.replaceAll(regex, replacement);
        }
        return null;
    }

    /**
     * Splits {@code text} around matches of {@code regex}.
     *
     * @param text the string to split
     * @param regex the regular expression used as separator
     * @param limit passed to {@link String#split(String, int)} as the result threshold
     * @return a new mutable list with the parts, or {@code null} if any argument is {@code null}
     */
    @UserFunction("apoc.text.split")
    @Description("Splits the given `STRING` using a given regular expression as a separator.")
    public List<String> split(
            final @Name("text") String text,
            final @Name("regex") String regex,
            final @Name(value = "limit", defaultValue = "0") Long limit) {
        if (text == null || regex == null || limit == null) {
            return null;
        }
        final String[] parts = text.split(regex, limit.intValue());
        // Copy into an ArrayList: Arrays.asList alone is only a fixed-size view of the array.
        return new ArrayList<>(asList(parts));
    }

    @UserFunction("apoc.text.regexGroups")
    @Description("Returns all groups matching the given regular expression in the given text.")
    public List> regexGroups(final @Name("text") String text, final @Name("regex") String regex) {
        if (text == null || regex == null) {
            return Collections.EMPTY_LIST;
        } else {
            final Pattern pattern = Pattern.compile(regex);
            final Matcher matcher = pattern.matcher(text);

            List> result = new ArrayList<>();
            while (matcher.find()) {
                List matchResult = new ArrayList<>();
                for (int i = 0; i <= matcher.groupCount(); i++) {
                    matchResult.add(matcher.group(i));
                }
                result.add(matchResult);
            }
            return result;
        }
    }

    /**
     * Concatenates the given strings, placing {@code delimiter} between consecutive elements.
     *
     * @param texts the strings to join
     * @param delimiter the separator inserted between elements
     * @return the joined string, or {@code null} if either argument is {@code null}
     */
    @UserFunction("apoc.text.join")
    @Description("Joins the given `STRING` values using the given delimiter.")
    public String join(final @Name("texts") List<String> texts, final @Name("delimiter") String delimiter) {
        if (texts == null || delimiter == null) {
            return null;
        }
        return String.join(delimiter, texts);
    }

    /**
     * Normalises {@code text} by running it through {@code removeNonWordCharacters}.
     *
     * @param text the string to clean
     * @return the cleaned string, or {@code null} if {@code text} is {@code null}
     */
    @UserFunction("apoc.text.clean")
    @Description(
            "Strips the given `STRING` of everything except alpha numeric characters and converts it to lower case.")
    public String clean(final @Name("text") String text) {
        if (text == null) {
            return null;
        }
        return removeNonWordCharacters(text);
    }

    /**
     * Checks whether two strings are equal after both have been passed through
     * {@code removeNonWordCharacters}.
     *
     * @param text1 the first string
     * @param text2 the second string
     * @return {@code true} if the cleaned forms are equal; {@code false} if they differ or if
     *     either argument is {@code null}
     */
    @UserFunction("apoc.text.compareCleaned")
    @Description(
            "Compares two given `STRING` values stripped of everything except alpha numeric characters converted to lower case.")
    public boolean compareCleaned(final @Name("text1") String text1, final @Name("text2") String text2) {
        if (text1 != null && text2 != null) {
            final String cleanedFirst = removeNonWordCharacters(text1);
            final String cleanedSecond = removeNonWordCharacters(text2);
            return cleanedFirst.equals(cleanedSecond);
        }
        return false;
    }

    /**
     * Alias that forwards to {@code levenshteinDistance}.
     *
     * @param text1 the first string
     * @param text2 the second string
     * @return whatever {@code levenshteinDistance} returns for the same arguments
     */
    @UserFunction("apoc.text.distance")
    @Description("Compares the two given `STRING` values using the Levenshtein distance algorithm.")
    public Long distance(final @Name("text1") String text1, @Name("text2") final String text2) {
        final Long edits = levenshteinDistance(text1, text2);
        return edits;
    }

    /**
     * Computes the Levenshtein distance between two strings using the shared calculator.
     *
     * @param text1 the first string
     * @param text2 the second string
     * @return the distance, or {@code null} if either argument is {@code null}
     */
    @UserFunction("apoc.text.levenshteinDistance")
    @Description("Compares the given `STRING` values using the Levenshtein distance algorithm.")
    public Long levenshteinDistance(final @Name("text1") String text1, @Name("text2") final String text2) {
        if (text1 != null && text2 != null) {
            final int edits = levenshteinDistance.apply(text1, text2);
            return (long) edits;
        }
        return null;
    }

    /**
     * Turns the Levenshtein distance of two strings into a similarity score.
     *
     * <p>The score is {@code (longerLength - distance) / longerLength}, where
     * {@code longerLength} is the length of the longer input. Two empty strings score
     * {@code 1.0}.
     *
     * @param text1 the first string
     * @param text2 the second string
     * @return the similarity score, or {@code null} if either argument is {@code null}
     */
    @UserFunction("apoc.text.levenshteinSimilarity")
    @Description(
            "Returns the similarity (a value within 0 and 1) between the two given `STRING` values based on the Levenshtein distance algorithm.")
    public Double levenshteinSimilarity(final @Name("text1") String text1, @Name("text2") final String text2) {
        final boolean bothPresent = text1 != null && text2 != null;
        if (!bothPresent) {
            return null;
        }

        final int maxLength = Math.max(text1.length(), text2.length());
        // Two empty strings are identical; returning early also avoids dividing by zero.
        if (maxLength == 0) {
            return 1.0;
        }

        final long edits = distance(text1, text2);
        final double denominator = maxLength;
        return (maxLength - edits) / denominator;
    }

    /**
     * Computes the Hamming distance between two strings using the shared calculator.
     *
     * @param text1 the first string
     * @param text2 the second string
     * @return the distance, or {@code null} if either argument is {@code null}
     */
    @UserFunction("apoc.text.hammingDistance")
    @Description("Compares the two given `STRING` values using the Hamming distance algorithm.")
    public Long hammingDistance(final @Name("text1") String text1, @Name("text2") final String text2) {
        if (text1 != null && text2 != null) {
            final int differences = hammingDistance.apply(text1, text2);
            return (long) differences;
        }
        return null;
    }

    /**
     * Applies the shared Jaro-Winkler calculator to two strings.
     *
     * @param text1 the first string
     * @param text2 the second string
     * @return the calculator's result, or {@code null} if either argument is {@code null}
     */
    @UserFunction("apoc.text.jaroWinklerDistance")
    @Description("Compares the two given `STRING` values using the Jaro-Winkler distance algorithm.")
    public Double jaroWinklerDistance(final @Name("text1") String text1, @Name("text2") final String text2) {
        if (text1 != null && text2 != null) {
            return jaroWinklerDistance.apply(text1, text2);
        }
        return null;
    }

    /**
     * Delegates to {@code SorensenDiceCoefficient.compute} for the two strings and language tag.
     *
     * @param text1 the first string
     * @param text2 the second string
     * @param languageTag the language tag handed on to the coefficient computation
     * @return the computed coefficient, or {@code null} if any argument is {@code null}
     */
    @UserFunction("apoc.text.sorensenDiceSimilarity")
    @Description(
            "Compares the two given `STRING` values using the Sørensen–Dice coefficient formula, with the provided IETF language tag.")
    public Double sorensenDiceSimilarity(
            final @Name("text1") String text1,
            final @Name("text2") String text2,
            final @Name(value = "languageTag", defaultValue = "en") String languageTag) {
        final boolean allPresent = text1 != null && text2 != null && languageTag != null;
        if (allPresent) {
            return SorensenDiceCoefficient.compute(text1, text2, languageTag);
        }
        return null;
    }

    /**
     * Decides whether two strings are "close enough" by Levenshtein distance.
     *
     * <p>The tolerated distance depends on the length of {@code text1}: 0 for fewer than 3
     * characters, 1 for fewer than 5, and 2 otherwise.
     *
     * @param text1 the term whose length sets the tolerance
     * @param text2 the string compared against it
     * @return {@code true} if the distance is within the tolerance, {@code false} otherwise, or
     *     {@code null} if either argument is {@code null}
     */
    @UserFunction("apoc.text.fuzzyMatch")
    @Description("Performs a fuzzy match search of the two given `STRING` values.")
    public Boolean fuzzyMatch(final @Name("text1") String text1, @Name("text2") final String text2) {
        if (text1 == null || text2 == null) {
            return null;
        }

        final int termLength = text1.length();
        final int tolerance;
        if (termLength < 3) {
            tolerance = 0;
        } else if (termLength < 5) {
            tolerance = 1;
        } else {
            tolerance = 2;
        }

        final long edits = distance(text1, text2);
        return edits <= tolerance;
    }

    /**
     * URL-encodes {@code text} with {@link URLEncoder} using UTF-8.
     *
     * @param text the string to encode
     * @return the encoded string
     * @throws RuntimeException wrapping the {@link UnsupportedEncodingException} if UTF-8 is
     *     reported as unsupported
     */
    @UserFunction("apoc.text.urlencode")
    @Description("Encodes the given URL `STRING`.")
    public String urlencode(@Name("text") String text) {
        final String encoded;
        try {
            encoded = URLEncoder.encode(text, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("urlencoding failed", e);
        }
        return encoded;
    }

    /**
     * Decodes a URL-encoded string with {@link URLDecoder} using UTF-8.
     *
     * @param text the encoded string
     * @return the decoded string
     * @throws RuntimeException wrapping the {@link UnsupportedEncodingException} if UTF-8 is
     *     reported as unsupported
     */
    @UserFunction("apoc.text.urldecode")
    @Description("Decodes the given URL encoded `STRING`.")
    public String urldecode(@Name("text") String text) {
        final String decoded;
        try {
            decoded = URLDecoder.decode(text, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("urldecoding failed", e);
        }
        return decoded;
    }

    // Matches every run of characters that are neither Unicode letters nor Unicode digits.
    private static final Pattern cleanPattern = Pattern.compile("[^\\p{L}\\p{N}]+");
    // Matches runs of characters in the Unicode "Mark" category, such as the combining accents
    // that NFD normalisation splits off from their base letters.
    private static final Pattern specialCharPattern = Pattern.compile("\\p{IsM}+");
    // German umlauts and sharp s with their ASCII transliterations: {original, replacement}.
    private static final String[][] UMLAUT_REPLACEMENTS = {
        {"Ä", "Ae"},
        {"Ü", "Ue"},
        {"Ö", "Oe"},
        {"ä", "ae"},
        {"ü", "ue"},
        {"ö", "oe"},
        {"ß", "ss"}
    };

    /**
     * Reduces a string to lower-case letters and digits.
     *
     * <p>Steps, in order: transliterate the entries of {@code UMLAUT_REPLACEMENTS}, decompose the
     * text with NFD, drop everything matched by {@code specialCharPattern}, drop everything
     * matched by {@code cleanPattern}, and lower-case the rest.
     *
     * @param s the string to clean; must not be {@code null}
     * @return the cleaned, lower-cased string
     */
    private static String removeNonWordCharacters(String s) {
        String result = s;
        // Transliterate before normalising: NFD would otherwise split "ä" into "a" plus a
        // combining mark and the result would be "a" rather than "ae".
        for (String[] replacement : UMLAUT_REPLACEMENTS) {
            result = result.replace(replacement[0], replacement[1]);
        }
        result = Normalizer.normalize(result, Normalizer.Form.NFD);
        result = specialCharPattern.matcher(result).replaceAll("");
        // Locale.ROOT keeps the output independent of the JVM default locale (under a Turkish
        // locale, for example, "I" would otherwise lower-case to a dotless "ı").
        return cleanPattern.matcher(result).replaceAll("").toLowerCase(Locale.ROOT);
    }

    /**
     * Pads {@code text} on the left until it is {@code count} characters long.
     *
     * <p>Only the first character of {@code delim} is used as padding. A string that is already
     * at least {@code count} characters long is returned unchanged.
     *
     * @param text the string to pad
     * @param count the target length
     * @param delim the string whose first character is the pad character
     * @return the padded string
     */
    @UserFunction("apoc.text.lpad")
    @Description("Left pads the given `STRING` by the given width.")
    public String lpad(
            @Name("text") String text,
            @Name("count") long count,
            @Name(value = "delimiter", defaultValue = " ") String delim) {
        final int textLength = text.length();
        if (count <= textLength) {
            return text;
        }
        final StringBuilder padded = new StringBuilder((int) count);
        final char[] padding = new char[(int) count - textLength];
        final char padChar = delim.charAt(0);
        Arrays.fill(padding, padChar);
        return padded.append(padding).append(text).toString();
    }

    /**
     * Pads {@code text} on the right until it is {@code count} characters long.
     *
     * <p>Only the first character of {@code delim} is used as padding. A string that is already
     * at least {@code count} characters long is returned unchanged.
     *
     * @param text the string to pad
     * @param count the target length
     * @param delim the string whose first character is the pad character
     * @return the padded string
     */
    @UserFunction("apoc.text.rpad")
    @Description("Right pads the given `STRING` by the given width.")
    public String rpad(
            @Name("text") String text,
            @Name("count") long count,
            @Name(value = "delimiter", defaultValue = " ") String delim) {
        final int textLength = text.length();
        if (count <= textLength) {
            return text;
        }
        final StringBuilder padded = new StringBuilder(text);
        final char[] padding = new char[(int) count - textLength];
        final char padChar = delim.charAt(0);
        Arrays.fill(padding, padChar);
        return padded.append(padding).toString();
    }

    @UserFunction("apoc.text.format")
    @Description("Formats the given `STRING` with the given parameters.")
    public String format(
            @Name("text") String text,
            @Name("params") List