All downloads are free. Search and download functionality uses the official Maven repository.

eu.interedition.collatex.CollationAlgorithm Maven / Gradle / Ivy

Go to download

A Java library for collating textual sources, for example, to produce an apparatus.

There is a newer version: 1.7.1
Show newest version
/*
 * Copyright (c) 2015 The Interedition Development Group.
 *
 * This file is part of CollateX.
 *
 * CollateX is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CollateX is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CollateX.  If not, see <http://www.gnu.org/licenses/>.
 */

package eu.interedition.collatex;

import eu.interedition.collatex.dekker.Match;
import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschAlgorithm;
import eu.interedition.collatex.needlemanwunsch.NeedlemanWunschScorer;
import eu.interedition.collatex.util.VertexMatch;

import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

/**
 * @author Gregor Middell
 */
public interface CollationAlgorithm {

    void collate(VariantGraph against, Iterable witness);

    void collate(VariantGraph against, Iterable... witnesses);

    void collate(VariantGraph against, List> witnesses);

    abstract class Base implements CollationAlgorithm {
        protected final Logger LOG = Logger.getLogger(getClass().getName());
        protected Map witnessTokenVertices;

        @Override
        public void collate(VariantGraph against, Iterable... witnesses) {
            collate(against, Arrays.asList(witnesses));
        }

        @Override
        public void collate(VariantGraph against, List> witnesses) {
            for (Iterable witness : witnesses) {
                if (LOG.isLoggable(Level.FINE)) {
                    LOG.log(Level.FINE, "heap space: {0}/{1}", new Object[]{
                        Runtime.getRuntime().totalMemory(),
                        Runtime.getRuntime().maxMemory()
                    });
                }
                collate(against, witness);
            }
        }

        protected void merge(VariantGraph into, Iterable witnessTokens, Map alignments) {
            final Witness witness = StreamSupport.stream(witnessTokens.spliterator(), false)
                .findFirst()
                .map(Token::getWitness)
                .orElseThrow(() -> new IllegalArgumentException("Empty witness"));

            if (LOG.isLoggable(Level.FINE)) {
                LOG.log(Level.FINE, "{0} + {1}: Merge comparand into graph", new Object[]{into, witness});
            }
            witnessTokenVertices = new HashMap<>();
            VariantGraph.Vertex last = into.getStart();
            final Set witnessSet = Collections.singleton(witness);
            for (Token token : witnessTokens) {
                VariantGraph.Vertex matchingVertex = alignments.get(token);
                if (matchingVertex == null) {
                    matchingVertex = into.add(token);
                } else {
                    if (LOG.isLoggable(Level.FINE)) {
                        LOG.log(Level.FINE, "Match: {0} to {1}", new Object[]{matchingVertex, token});
                    }
                    matchingVertex.add(Collections.singleton(token));
                }
                witnessTokenVertices.put(token, matchingVertex);

                into.connect(last, matchingVertex, witnessSet);
                last = matchingVertex;
            }
            into.connect(last, into.getEnd(), witnessSet);
        }

        protected void mergeTranspositions(VariantGraph into, Iterable> transpositions) {
            for (SortedSet transposedPhrase : transpositions) {
                if (LOG.isLoggable(Level.FINE)) {
                    LOG.log(Level.FINE, "Transposition: {0}", transposedPhrase);
                }
                final Set transposed = new HashSet<>();
                for (VertexMatch.WithToken match : transposedPhrase) {
                    transposed.add(witnessTokenVertices.get(match.token));
                    transposed.add(match.vertex);
                }
                into.transpose(transposed);
            }
        }

        protected void mergeTranspositions(VariantGraph into, List> transpositions) {
            for (List transposedPhrase : transpositions) {
                if (LOG.isLoggable(Level.FINE)) {
                    LOG.log(Level.FINE, "Transposition: {0}", transposedPhrase);
                }
                final Set transposed = new HashSet<>();
                for (Match match : transposedPhrase) {
                    transposed.add(witnessTokenVertices.get(match.token));
                    transposed.add(match.vertex);
                }
                into.transpose(transposed);
            }
        }

        protected void merge(VariantGraph graph, VariantGraph.Vertex[][] vertices, Token[] tokens, SortedSet> matches) {
            @SuppressWarnings("unchecked")
            final SortedSet[] matchesVertexOrder = matches.toArray(new SortedSet[matches.size()]);
            final SortedSet[] matchesTokenOrder = Arrays.copyOf(matchesVertexOrder, matchesVertexOrder.length);

            Arrays.sort(matchesTokenOrder, Comparator.comparing(m -> m.first().token));

            final Set> alignedMatches = NeedlemanWunschAlgorithm.align(
                matchesVertexOrder,
                matchesTokenOrder,
                new MatchPhraseAlignmentScorer(Math.max(tokens.length, vertices.length))
            ).keySet();

            final Map alignments = matches.stream()
                .filter(alignedMatches::contains)
                .flatMap(Set::stream)
                .collect(Collectors.toMap(m -> tokens[m.token], m -> m.vertex));

            final List> transpositions = matches.stream()
                .filter(m -> !alignedMatches.contains(m))
                .map(t -> t.stream().map(m -> new VertexMatch.WithToken(m.vertex, m.vertexRank, tokens[m.token])).collect(Collectors.toCollection(TreeSet::new)))
                .collect(Collectors.toList());

            merge(graph, Arrays.asList(tokens), alignments);
            mergeTranspositions(graph, transpositions);
        }
    }

    static class MatchPhraseAlignmentScorer implements NeedlemanWunschScorer, SortedSet> {

        private final int maxWitnessLength;

        public MatchPhraseAlignmentScorer(int maxWitnessLength) {
            this.maxWitnessLength = maxWitnessLength;
        }

        @Override
        public float score(SortedSet a, SortedSet b) {
            return (a.equals(b) ? 1 : -maxWitnessLength);
        }

        @Override
        public float gap() {
            return -(1 / (maxWitnessLength * 1.0f));
        }

    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy