
eu.interedition.collatex.medite.Matches Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of collatex-core Show documentation
Show all versions of collatex-core Show documentation
A Java library for collating textual sources, for example, to produce an apparatus.
The newest version!
/*
* Copyright (c) 2015 The Interedition Development Group.
*
* This file is part of CollateX.
*
* CollateX is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* CollateX is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with CollateX. If not, see <http://www.gnu.org/licenses/>.
*/
package eu.interedition.collatex.medite;
import eu.interedition.collatex.Token;
import eu.interedition.collatex.VariantGraph;
import eu.interedition.collatex.util.VertexMatch;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* @author Gregor Middell
*/
public class Matches extends ArrayList> {
public Matches(int initialCapacity) {
super(initialCapacity);
}
public static Matches between(VariantGraph.Vertex[][] vertices, SuffixTree suffixTree, Function, Integer> matchEvaluator) {
final Map> matchThreads = new HashMap<>();
for (int rank = 0; rank < vertices.length; rank++) {
for (VariantGraph.Vertex vertex : vertices[rank]) {
final MatchThreadElement matchThreadElement = new MatchThreadElement(suffixTree).advance(vertex, rank);
if (matchThreadElement != null) {
matchThreads.computeIfAbsent(rank, r -> new LinkedList<>()).add(matchThreadElement);
}
}
for (MatchThreadElement matchThreadElement : matchThreads.getOrDefault(rank - 1, Collections.emptyList())) {
for (VariantGraph.Vertex vertex : vertices[rank]) {
final MatchThreadElement advanced = matchThreadElement.advance(vertex, rank);
if (advanced != null) {
matchThreads.computeIfAbsent(rank, r -> new LinkedList<>()).add(advanced);
}
}
}
}
final Matches matches = new Matches(matchThreads.size());
matchThreads.values().stream().flatMap(List::stream).forEach(matchThreadElement -> {
final List> threadPhrases = new ArrayList<>();
boolean firstElement = true;
for (MatchThreadElement threadElement : matchThreadElement.thread()) {
final SuffixTree.EquivalenceClass equivalenceClass = threadElement.cursor.matchedClass();
for (int mc = 0; mc < equivalenceClass.length; mc++) {
final int tokenCandidate = equivalenceClass.members[mc];
if (firstElement) {
final SortedSet phrase = new TreeSet<>();
phrase.add(new VertexMatch.WithTokenIndex(threadElement.vertex, threadElement.vertexRank, tokenCandidate));
threadPhrases.add(phrase);
} else {
for (SortedSet phrase : threadPhrases) {
if ((phrase.last().token + 1) == tokenCandidate) {
phrase.add(new VertexMatch.WithTokenIndex(threadElement.vertex, threadElement.vertexRank, tokenCandidate));
}
}
}
}
firstElement = false;
}
matches.addAll(threadPhrases);
});
Collections.sort(matches, maximalUniqueMatchOrdering(matchEvaluator));
return matches;
}
private static Comparator> maximalUniqueMatchOrdering(final Function, Integer> matchEvaluator) {
return new Comparator>() {
@Override
public int compare(SortedSet o1, SortedSet o2) {
// 1. reverse ordering by match value
int result = matchEvaluator.apply(o2) - matchEvaluator.apply(o1);
if (result != 0) {
return result;
}
final VertexMatch.WithTokenIndex firstMatch1 = o1.first();
final VertexMatch.WithTokenIndex firstMatch2 = o2.first();
// 2. ordering by match distance
result = (Math.abs(firstMatch1.token - firstMatch1.vertexRank) - Math.abs(firstMatch2.token - firstMatch2.vertexRank));
if (result != 0) {
return result;
}
// 3. ordering by first vertex ranking
result = firstMatch1.vertexRank - firstMatch2.vertexRank;
if (result != 0) {
return result;
}
// 3. ordering by first token index
return firstMatch1.token - firstMatch2.token;
}
};
}
public SortedSet> findMaximalUniqueMatches() {
final List> allMatches = new ArrayList<>(this);
final SortedSet> maximalUniqueMatches = new TreeSet<>(VertexMatch.setComparator());
while (true) {
SortedSet nextMum = null;
SortedSet candidate = null;
for (SortedSet successor : allMatches) {
if (candidate == null) {
continue;
}
if (candidate.size() > successor.size() || candidate.first().token == successor.first().token) {
nextMum = candidate;
break;
}
candidate = successor;
}
if (nextMum == null) {
nextMum = allMatches.stream().findFirst().orElse(null);
}
if (nextMum == null) {
break;
}
if (!maximalUniqueMatches.add(nextMum)) {
throw new IllegalStateException("Duplicate MUM");
}
final BitSet rankFilter = new BitSet();
final BitSet tokenFilter = new BitSet();
rankFilter.set(nextMum.first().vertexRank, nextMum.last().vertexRank + 1);
tokenFilter.set(nextMum.first().token, nextMum.last().token + 1);
allMatches.removeIf(VertexMatch.filter(rankFilter, tokenFilter));
}
return maximalUniqueMatches;
}
/**
* @author Gregor Middell
*/
static class MatchThreadElement {
final MatchThreadElement previous;
final VariantGraph.Vertex vertex;
final int vertexRank;
final SuffixTree.Cursor cursor;
MatchThreadElement(SuffixTree suffixTree) {
this(null, null, -1, suffixTree.cursor());
}
MatchThreadElement(MatchThreadElement previous, VariantGraph.Vertex vertex, int vertexRank, SuffixTree.Cursor cursor) {
this.previous = previous;
this.vertex = vertex;
this.vertexRank = vertexRank;
this.cursor = cursor;
}
MatchThreadElement advance(VariantGraph.Vertex vertex, int vertexRank) {
final Set tokens = vertex.tokens();
if (!tokens.isEmpty()) {
final SuffixTree.Cursor next = cursor.move(tokens.stream().findFirst().get());
if (next != null) {
return new MatchThreadElement(this, vertex, vertexRank, next);
}
}
return null;
}
List thread() {
final LinkedList thread = new LinkedList<>();
MatchThreadElement current = this;
while (current.vertex != null) {
thread.addFirst(current);
current = current.previous;
}
return thread;
}
@Override
public String toString() {
return "[" + Arrays.asList(vertexRank, vertex, cursor.matchedClass()).stream().map(Object::toString).collect(Collectors.joining(", ")) + "]";
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy