apoc.text.SorensenDiceCoefficient Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of apoc-core Show documentation
Show all versions of apoc-core Show documentation
Core package for Neo4j Procedures
/*
* Copyright (c) "Neo4j"
* Neo4j Sweden AB [http://neo4j.com]
*
* This file is part of Neo4j.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package apoc.text;
import static java.util.stream.Collectors.toList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.stream.IntStream;
import java.util.stream.Stream;
public class SorensenDiceCoefficient {
private static final double HIGHEST_SCORE = 1.0;
public static double compute(String input1, String input2) {
return compute(input1, input2, Locale.ENGLISH);
}
public static double compute(String input1, String input2, String languageTag) {
return compute(input1, input2, Locale.forLanguageTag(languageTag));
}
private static double compute(String input1, String input2, Locale locale) {
if (input1.equals(input2)) {
return HIGHEST_SCORE;
}
List words1 = normalizedWords(input1, locale);
List words2 = normalizedWords(input2, locale);
if (words1.equals(words2)) {
return HIGHEST_SCORE;
}
List bigrams1 = allSortedBigrams(words1);
List bigrams2 = allSortedBigrams(words2);
int index1 = 0, index2 = 0, matches = 0;
while (index1 < bigrams1.size() && index2 < bigrams2.size()) {
Bigram bigram1 = bigrams1.get(index1);
Bigram bigram2 = bigrams2.get(index2);
if (bigram1.equals(bigram2)) {
matches++;
index1++;
index2++;
continue;
}
if (bigram1.lessThan(bigram2)) {
index1++;
continue;
}
index2++;
}
return 2.0 * matches / (bigrams1.size() + bigrams2.size());
}
private static List allSortedBigrams(List words) {
return words.stream().flatMap(s -> toStream(s.toCharArray())).sorted().collect(toList());
}
private static Stream toStream(char[] chars) {
return IntStream.range(0, chars.length - 1).mapToObj(i -> new Bigram(chars[i], chars[i + 1]));
}
private static List normalizedWords(String text1, Locale locale) {
return Arrays.asList(text1.trim().toUpperCase(locale).split("\\s+"));
}
private static class Bigram implements Comparable {
private final char first;
private final char second;
public Bigram(char first, char second) {
this.first = first;
this.second = second;
}
@Override
public int hashCode() {
return Objects.hash(first, second);
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if (obj == null || getClass() != obj.getClass()) {
return false;
}
final Bigram other = (Bigram) obj;
return Objects.equals(this.first, other.first) && Objects.equals(this.second, other.second);
}
public boolean lessThan(Bigram other) {
if (this.first < other.first) {
return true;
}
return this.first == other.first && this.second < other.second;
}
@Override
public String toString() {
return String.format("[%c,%c]", first, second);
}
@Override
public int compareTo(Bigram other) {
if (this == other) {
return 0;
}
if (this.first == other.first && this.second == other.second) {
return 0;
}
if (this.lessThan(other)) {
return -1;
}
return 1;
}
}
}