eu.interedition.collatex.dekker.DekkerAlgorithm Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of collatex-core Show documentation
A Java library for collating textual sources, for example, to produce an apparatus.
There is a newer version: 1.7.1
/*
 * Copyright (c) 2015 The Interedition Development Group.
 *
 * This file is part of CollateX.
 *
 * CollateX is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CollateX is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CollateX.  If not, see <http://www.gnu.org/licenses/>.
 */
package eu.interedition.collatex.dekker;

import eu.interedition.collatex.CollationAlgorithm;
import eu.interedition.collatex.Token;
import eu.interedition.collatex.VariantGraph;
import eu.interedition.collatex.Witness;
import eu.interedition.collatex.dekker.matrix.MatchTableLinker;
import eu.interedition.collatex.util.VariantGraphRanking;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

public class DekkerAlgorithm extends CollationAlgorithm.Base {

    private final Comparator comparator;
    private final TokenLinker tokenLinker;
    private final PhraseMatchDetector phraseMatchDetector;
    private final TranspositionDetector transpositionDetector;
    private Map tokenLinks;
    private List> phraseMatches;
    private List> transpositions;
    private Map alignments;
    private boolean mergeTranspositions = false;

    public DekkerAlgorithm(Comparator comparator) {
        this(comparator, new MatchTableLinker());
    }

    public DekkerAlgorithm(Comparator comparator, TokenLinker tokenLinker) {
        this.comparator = comparator;
        this.tokenLinker = tokenLinker;
        this.phraseMatchDetector = new PhraseMatchDetector();
        this.transpositionDetector = new TranspositionDetector();
    }

    @Override
    public void collate(VariantGraph graph, Iterable tokens) {
        final Witness witness = StreamSupport.stream(tokens.spliterator(), false)
            .findFirst()
            .map(Token::getWitness)
            .orElseThrow(() -> new IllegalArgumentException("Empty witness"));

        if (LOG.isLoggable(Level.FINER)) {
            LOG.log(Level.FINER, "{0} + {1}: {2} vs. {3}", new Object[]{graph, witness, graph.vertices(), tokens});
        }

        if (LOG.isLoggable(Level.FINE)) {
            LOG.log(Level.FINE, "{0} + {1}: Match and link tokens", new Object[]{graph, witness});
        }
        tokenLinks = tokenLinker.link(graph, tokens, comparator);

        if (LOG.isLoggable(Level.FINER)) {
            for (Map.Entry tokenLink : tokenLinks.entrySet()) {
                LOG.log(Level.FINER, "{0} + {1}: Token match: {2} = {3}", new Object[]{graph, witness, tokenLink.getValue(), tokenLink.getKey()});
            }
        }

        if (LOG.isLoggable(Level.FINE)) {
            LOG.log(Level.FINE, "{0} + {1}: Detect phrase matches", new Object[]{graph, witness});
        }
        phraseMatches = phraseMatchDetector.detect(tokenLinks, graph, tokens);
        if (LOG.isLoggable(Level.FINER)) {
            for (List phraseMatch : phraseMatches) {
                LOG.log(Level.FINER, "{0} + {1}: Phrase match: {2}", new Object[]{graph, witness, phraseMatch});
            }
        }

        if (LOG.isLoggable(Level.FINE)) {
            LOG.log(Level.FINE, "{0} + {1}: Detect transpositions", new Object[]{graph, witness});
        }
        transpositions = transpositionDetector.detect(phraseMatches, graph);
        if (LOG.isLoggable(Level.FINE)) {
            LOG.log(Level.FINE, "transpositions:{0}", transpositions);
        }

        if (LOG.isLoggable(Level.FINER)) {
            for (List transposition : transpositions) {
                LOG.log(Level.FINER, "{0} + {1}: Transposition: {2}", new Object[]{graph, witness, transposition});
            }
        }

        if (LOG.isLoggable(Level.FINE)) {
            LOG.log(Level.FINE, "{0} + {1}: Determine aligned tokens by filtering transpositions", new Object[]{graph, witness});
        }
        alignments = new HashMap<>();
        for (List phrase : phraseMatches) {
            for (Match match : phrase) {
                alignments.put(match.token, match.vertex);
            }
        }

        for (List transposedPhrase : transpositions) {
            for (Match match : transposedPhrase) {
                alignments.remove(match.token);
            }
        }
        if (LOG.isLoggable(Level.FINER)) {
            for (Map.Entry alignment : alignments.entrySet()) {
                LOG.log(Level.FINER, "{0} + {1}: Alignment: {2} = {3}", new Object[]{graph, witness, alignment.getValue(), alignment.getKey()});
            }
        }

        merge(graph, tokens, alignments);

        // we filter out small transposed phrases over large distances
        List> falseTranspositions = new ArrayList<>();

        VariantGraphRanking ranking = VariantGraphRanking.of(graph);

        for (List transposedPhrase : transpositions) {
            Match match = transposedPhrase.get(0);
            VariantGraph.Vertex v1 = witnessTokenVertices.get(match.token);
            VariantGraph.Vertex v2 = match.vertex;
            int distance = Math.abs(ranking.apply(v1) - ranking.apply(v2)) - 1;
            if (distance > transposedPhrase.size() * 3) {
                falseTranspositions.add(transposedPhrase);
            }
        }

        for (List transposition : falseTranspositions) {
            transpositions.remove(transposition);
        }

        if (mergeTranspositions) {
            mergeTranspositions(graph, transpositions);
        }

        if (LOG.isLoggable(Level.FINER)) {
            LOG.log(Level.FINER, "!{0}: {1}", new Object[]{graph, StreamSupport.stream(graph.vertices().spliterator(), false).map(Object::toString).collect(Collectors.joining(", "))});
        }
    }

    public Map getTokenLinks() {
        return tokenLinks;
    }

    public List> getPhraseMatches() {
        return Collections.unmodifiableList(phraseMatches);
    }

    public List> getTranspositions() {
        return Collections.unmodifiableList(transpositions);
    }

    public Map getAlignments() {
        return Collections.unmodifiableMap(alignments);
    }

    /*
     * This check disables transposition rendering in the variant
     * graph when the variant graph contains more then two witnesses.
     * Transposition detection is done in a progressive manner
     * (witness by witness). When viewing the resulting graph
     * containing the variation for all witnesses
     * the detected transpositions can look strange, since segments
     * may have split into smaller or larger parts.
     */
    public void setMergeTranspositions(boolean b) {
        this.mergeTranspositions = b;
    }
}