eu.interedition.collatex.CollationAlgorithm Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of collatex-core Show documentation
Show all versions of collatex-core Show documentation
A Java library for collating textual sources, for example, to produce an apparatus.
/*
* Copyright (c) 2015 The Interedition Development Group.
*
* This file is part of CollateX.
*
* CollateX is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* CollateX is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with CollateX. If not, see <http://www.gnu.org/licenses/>.
*/
package eu.interedition.collatex;
import eu.interedition.collatex.dekker.Match;
import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm;
import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschScorer;
import eu.interedition.collatex.util.VertexMatch;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
/**
* @author Gregor Middell
*/
public interface CollationAlgorithm {
void collate(VariantGraph against, Iterable witness);
void collate(VariantGraph against, Iterable... witnesses);
void collate(VariantGraph against, List extends Iterable> witnesses);
abstract class Base implements CollationAlgorithm {
protected final Logger LOG = Logger.getLogger(getClass().getName());
protected Map witnessTokenVertices;
@Override
public void collate(VariantGraph against, Iterable... witnesses) {
collate(against, Arrays.asList(witnesses));
}
@Override
public void collate(VariantGraph against, List extends Iterable> witnesses) {
for (Iterable witness : witnesses) {
if (LOG.isLoggable(Level.FINE)) {
LOG.log(Level.FINE, "heap space: {0}/{1}", new Object[]{
Runtime.getRuntime().totalMemory(),
Runtime.getRuntime().maxMemory()
});
}
collate(against, witness);
}
}
protected void merge(VariantGraph into, Iterable witnessTokens, Map alignments) {
final Witness witness = StreamSupport.stream(witnessTokens.spliterator(), false)
.findFirst()
.map(Token::getWitness)
.orElseThrow(() -> new IllegalArgumentException("Empty witness"));
if (LOG.isLoggable(Level.FINE)) {
LOG.log(Level.FINE, "{0} + {1}: Merge comparand into graph", new Object[]{into, witness});
}
witnessTokenVertices = new HashMap<>();
VariantGraph.Vertex last = into.getStart();
final Set witnessSet = Collections.singleton(witness);
for (Token token : witnessTokens) {
VariantGraph.Vertex matchingVertex = alignments.get(token);
if (matchingVertex == null) {
matchingVertex = into.add(token);
} else {
if (LOG.isLoggable(Level.FINE)) {
LOG.log(Level.FINE, "Match: {0} to {1}", new Object[]{matchingVertex, token});
}
matchingVertex.add(Collections.singleton(token));
}
witnessTokenVertices.put(token, matchingVertex);
into.connect(last, matchingVertex, witnessSet);
last = matchingVertex;
}
into.connect(last, into.getEnd(), witnessSet);
}
protected void mergeTranspositions(VariantGraph into, Iterable> transpositions) {
for (SortedSet transposedPhrase : transpositions) {
if (LOG.isLoggable(Level.FINE)) {
LOG.log(Level.FINE, "Transposition: {0}", transposedPhrase);
}
final Set transposed = new HashSet<>();
for (VertexMatch.WithToken match : transposedPhrase) {
transposed.add(witnessTokenVertices.get(match.token));
transposed.add(match.vertex);
}
into.transpose(transposed);
}
}
protected void mergeTranspositions(VariantGraph into, List> transpositions) {
for (List transposedPhrase : transpositions) {
if (LOG.isLoggable(Level.FINE)) {
LOG.log(Level.FINE, "Transposition: {0}", transposedPhrase);
}
final Set transposed = new HashSet<>();
for (Match match : transposedPhrase) {
transposed.add(witnessTokenVertices.get(match.token));
transposed.add(match.vertex);
}
into.transpose(transposed);
}
}
protected void merge(VariantGraph graph, VariantGraph.Vertex[][] vertices, Token[] tokens, SortedSet> matches) {
@SuppressWarnings("unchecked")
final SortedSet[] matchesVertexOrder = matches.toArray(new SortedSet[matches.size()]);
final SortedSet[] matchesTokenOrder = Arrays.copyOf(matchesVertexOrder, matchesVertexOrder.length);
Arrays.sort(matchesTokenOrder, Comparator.comparing(m -> m.first().token));
final Set> alignedMatches = NeedlemanWunschAlgorithm.align(
matchesVertexOrder,
matchesTokenOrder,
new MatchPhraseAlignmentScorer(Math.max(tokens.length, vertices.length))
).keySet();
final Map alignments = matches.stream()
.filter(alignedMatches::contains)
.flatMap(Set::stream)
.collect(Collectors.toMap(m -> tokens[m.token], m -> m.vertex));
final List> transpositions = matches.stream()
.filter(m -> !alignedMatches.contains(m))
.map(t -> t.stream().map(m -> new VertexMatch.WithToken(m.vertex, m.vertexRank, tokens[m.token])).collect(Collectors.toCollection(TreeSet::new)))
.collect(Collectors.toList());
merge(graph, Arrays.asList(tokens), alignments);
mergeTranspositions(graph, transpositions);
}
}
static class MatchPhraseAlignmentScorer implements NeedlemanWunschScorer, SortedSet> {
private final int maxWitnessLength;
public MatchPhraseAlignmentScorer(int maxWitnessLength) {
this.maxWitnessLength = maxWitnessLength;
}
@Override
public float score(SortedSet a, SortedSet b) {
return (a.equals(b) ? 1 : -maxWitnessLength);
}
@Override
public float gap() {
return -(1 / (maxWitnessLength * 1.0f));
}
}
}