eu.interedition.collatex.util.GreedyStringTilingAlgorithm Maven / Gradle / Ivy
Show all versions of collatex-core Show documentation
/*
* Copyright (c) 2015 The Interedition Development Group.
*
* This file is part of CollateX.
*
* CollateX is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* CollateX is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with CollateX. If not, see .
*/
package eu.interedition.collatex.util;
import eu.interedition.collatex.CollationAlgorithm;
import eu.interedition.collatex.Token;
import eu.interedition.collatex.VariantGraph;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.stream.StreamSupport;
/**
* Greedy String Tiling.
*
* Implements the Greedy String Tiling algorithm as proposed by Michael J. Wise in his paper:
* "String Similarity via Greedy String Tiling and Running Karp-Rabin Matching"
*
* @author Arno Mittelbach
* @author Lasse Lehmann
* @author Gregor Middell
*/
public class GreedyStringTilingAlgorithm extends CollationAlgorithm.Base {
private final Comparator comparator;
private final int minimumTileLength;
private final Equality equality = new Equality() {
@Override
public boolean isEqual(VariantGraph.Vertex[] a, Token b) {
for (VariantGraph.Vertex vertex : a) {
final Set tokens = vertex.tokens();
if (!tokens.isEmpty() && comparator.compare(tokens.stream().findFirst().get(), b) == 0) {
return true;
}
}
return false;
}
};
public GreedyStringTilingAlgorithm(Comparator comparator, int minimumTileLength) {
this.comparator = comparator;
this.minimumTileLength = minimumTileLength;
}
@Override
public void collate(VariantGraph graph, Iterable witness) {
final VariantGraph.Vertex[][] vertices = VariantGraphRanking.of(graph).asArray();
final Token[] tokens = StreamSupport.stream(witness.spliterator(), false).toArray(Token[]::new);
final SortedSet> matches = new TreeSet<>(VertexMatch.setComparator());
for (Match match : match(vertices, tokens, equality, minimumTileLength)) {
final SortedSet phrase = new TreeSet<>();
for (int mc = 0, ml = match.length; mc < ml; mc++) {
final int rank = match.left + mc;
phrase.add(new VertexMatch.WithTokenIndex(vertices[rank][0], rank, match.right + mc));
}
matches.add(phrase);
}
merge(graph, vertices, tokens, matches);
}
public static SortedSet match(A[] left, B[] right, Equality equality, int minimumTileLength) {
final boolean[] markedLeft = new boolean[left.length];
final boolean[] markedRight = new boolean[right.length];
Arrays.fill(markedLeft, false);
Arrays.fill(markedRight, false);
final SortedSet matches = new TreeSet<>();
final Map> matchesByLength = new HashMap<>();
int maxMatchLength;
do {
maxMatchLength = minimumTileLength;
for (int rc = 0; rc < right.length; rc++) {
for (int lc = 0; lc < left.length; lc++) {
int matchLength = 0;
for (int tc = 0;
(tc + lc) < left.length && (tc + rc) < right.length &&
!markedLeft[lc + tc] && !markedRight[rc + tc] &&
equality.isEqual(left[lc + tc], right[rc + tc]);
tc++) {
matchLength++;
}
if (matchLength >= maxMatchLength) {
List theMatches = matchesByLength.get(matchLength);
if (theMatches == null) {
matchesByLength.put(matchLength, theMatches = new ArrayList<>());
}
theMatches.add(new Match(lc, rc));
}
if (matchLength > maxMatchLength) {
maxMatchLength = matchLength;
}
}
}
for (Match match : matchesByLength.getOrDefault(maxMatchLength, Collections.emptyList())) {
boolean occluded = false;
for (int tc = 0; tc < maxMatchLength; tc++) {
if (markedLeft[match.left + tc] || markedRight[match.right + tc]) {
occluded = true;
break;
}
}
if (!occluded) {
for (int tc = 0; tc < maxMatchLength; tc++) {
markedLeft[match.left + tc] = true;
markedRight[match.right + tc] = true;
}
matches.add(new Match(match.left, match.right, maxMatchLength));
}
}
} while (maxMatchLength > minimumTileLength);
return matches;
}
public static interface Equality {
boolean isEqual(A a, B b);
}
public static class Match implements Comparable {
public final int left;
public final int right;
public final int length;
public Match(int left, int right, int length) {
this.left = left;
this.right = right;
this.length = length;
}
public Match(int left, int right) {
this(left, right, 0);
}
@Override
public boolean equals(Object obj) {
if (obj != null && obj instanceof Match) {
return (left == ((Match) obj).left);
}
return super.equals(obj);
}
@Override
public int hashCode() {
return left;
}
@Override
public int compareTo(Match o) {
return left - o.left;
}
}
}