All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.interedition.collatex.simple.SimpleVariantGraphSerializer Maven / Gradle / Ivy

Go to download

A Java library for collating textual sources, for example, to produce an apparatus.

There is a newer version: 1.7.1
Show newest version
/*
 * Copyright (c) 2015 The Interedition Development Group.
 *
 * This file is part of CollateX.
 *
 * CollateX is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CollateX is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CollateX.  If not, see <http://www.gnu.org/licenses/>.
 */

package eu.interedition.collatex.simple;

import eu.interedition.collatex.Token;
import eu.interedition.collatex.VariantGraph;
import eu.interedition.collatex.Witness;
import eu.interedition.collatex.dekker.Tuple;
import eu.interedition.collatex.util.ParallelSegmentationApparatus;
import eu.interedition.collatex.util.VariantGraphRanking;

import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Function;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

/**
 * @author Gregor Middell
 */
public class SimpleVariantGraphSerializer {
    /**
     * CollateX custom namespace.
     */
    protected static final String COLLATEX_NS = "http://interedition.eu/collatex/ns/1.0";

    /**
     * The TEI P5 namespace.
     */
    protected static final String TEI_NS = "http://www.tei-c.org/ns/1.0";

    private final VariantGraph graph;
    private final Function, String> tokensToString;
    private final Map vertexIds = new HashMap<>();
    private VariantGraphRanking ranking;

    public SimpleVariantGraphSerializer(VariantGraph graph) {
        this(graph, SIMPLE_TOKEN_TO_STRING);
    }

    public SimpleVariantGraphSerializer(VariantGraph graph, Function, String> tokensToString) {
        this.graph = graph;
        this.tokensToString = tokensToString;
    }

    public void toTEI(final XMLStreamWriter xml) throws XMLStreamException {
        try {
            ParallelSegmentationApparatus.generate(ranking(), new ParallelSegmentationApparatus.GeneratorCallback() {
                @Override
                public void start() {
                    try {
                        xml.writeStartElement("cx", "apparatus", COLLATEX_NS);
                        xml.writeNamespace("cx", COLLATEX_NS);
                        xml.writeNamespace("", TEI_NS);
                    } catch (XMLStreamException e) {
                        throw new RuntimeException(e);
                    }
                }

                @Override
                public void segment(SortedMap> contents) {
                    final Map> segments = new LinkedHashMap<>();
                    contents.forEach((witness, tokens) -> segments.computeIfAbsent(tokensToString.apply(tokens).trim(), k -> new HashSet<>()).add(witness));

                    final Set segmentContents = segments.keySet();
                    try {
                        if (segmentContents.size() == 1) {
                            xml.writeCharacters(segmentContents.stream().findFirst().get());
                        } else {
                            xml.writeStartElement("", "app", TEI_NS);
                            for (String segment : segmentContents) {
                                final StringBuilder witnesses = new StringBuilder();
                                for (Witness witness : segments.get(segment)) {
                                    witnesses.append(witness.getSigil()).append(" ");
                                }
                                if (segment.length() == 0) {
                                    xml.writeEmptyElement("", "rdg", TEI_NS);
                                } else {
                                    xml.writeStartElement("", "rdg", TEI_NS);
                                }

                                xml.writeAttribute("wit", witnesses.toString().trim());

                                if (segment.length() > 0) {
                                    xml.writeCharacters(segment);
                                    xml.writeEndElement();
                                }
                            }
                            xml.writeEndElement();
                        }
                    } catch (XMLStreamException e) {
                        throw new RuntimeException(e);
                    }
                }

                @Override
                public void end() {
                    try {
                        xml.writeEndElement();
                    } catch (XMLStreamException e) {
                        throw new RuntimeException(e);
                    }
                }
            });
        } catch (RuntimeException re) {
            Throwable rootCause = re;
            for (Throwable cause = re; cause != null; cause = cause.getCause()) {
                rootCause = cause;
            }
            if (rootCause instanceof XMLStreamException) {
                throw (XMLStreamException) rootCause;
            }
            throw re;
        }
    }

    public void toCsv(final Writer out) throws IOException {
        try {
            ParallelSegmentationApparatus.generate(ranking(), new ParallelSegmentationApparatus.GeneratorCallback() {
                @Override
                public void start() {
                    try {
                        for (Iterator it = graph.witnesses().stream().sorted(Witness.SIGIL_COMPARATOR).iterator(); it.hasNext(); ) {
                            out.write(escapeCsvField(it.next().getSigil()));
                            if (it.hasNext()) {
                                out.write(",");
                            }
                        }
                        out.write("\r\n");
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }

                @Override
                public void segment(SortedMap> contents) {
                    try {
                        for (Iterator witnessIt = contents.keySet().iterator(); witnessIt.hasNext(); ) {
                            out.write(escapeCsvField(tokensToString.apply(contents.getOrDefault(witnessIt.next(), Collections.emptySet()))));
                            if (witnessIt.hasNext()) {
                                out.write(",");
                            }
                        }
                        out.write("\r\n");
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }

                @Override
                public void end() {
                }
            });
        } catch (Throwable t) {
            for (Throwable cause = t; cause != null; cause = cause.getCause()) {
                if (cause instanceof IOException) {
                    throw (IOException) cause;
                }
            }
            if (t instanceof RuntimeException) {
                throw (RuntimeException) t;
            }
            throw new RuntimeException(t);
        }
    }

    static final Pattern CSV_SPECIAL_CHARS = Pattern.compile("[\r\n\",]");

    static String escapeCsvField(String content) {
        return (CSV_SPECIAL_CHARS.matcher(content).find() ? ("\"" + content.replaceAll("\"", "\"\"") + "\"") : content);
    }

    public void toDot(Writer writer) {
        final PrintWriter out = new PrintWriter(writer);
        final String indent = "  ";
        final String connector = " -> ";

        out.println("digraph G {");

        for (VariantGraph.Vertex v : graph.vertices()) {
            out.print(indent + id(v));
            out.print(" [label = \"" + toDotLabel(v) + "\"]");
            out.println(";");
        }

        for (VariantGraph.Vertex v : graph.vertices()) {
            for (Map.Entry> e : v.outgoing().entrySet()) {
                out.print(indent + id(v) + connector + id(e.getKey()));
                out.print(" [label = \"" + toDotLabel(e.getValue()) + "\"]");
                out.println(";");
            }
        }

        for (Tuple transposedTuple : transposedTuples()) {
            final String leftId = id(transposedTuple.left);
            final String rightId = id(transposedTuple.right);
            out.print(indent + leftId + connector + rightId);
            out.print(" [ color = \"lightgray\", style = \"dashed\" arrowhead = \"none\", arrowtail = \"none\" ]");
            out.println(";");
        }

        out.print(indent + id(graph.getStart()) + connector + id(graph.getEnd()));
        out.print(" [color =  \"white\"]");
        out.println(";");

        out.println("}");

        out.flush();
    }

    private String id(VariantGraph.Vertex vertex) {
        return ("v" + numericId(vertex));
    }

    private int numericId(VariantGraph.Vertex vertex) {
        Integer id = vertexIds.get(vertex);
        if (id == null) {
            id = vertexIds.size();
            vertexIds.put(vertex, id);
        }
        return id;
    }

    String toDotLabel(Set e) {
        return escapeDotLabel(e.stream().map(Witness::getSigil).distinct().sorted().collect(Collectors.joining(", ")));
    }

    String toDotLabel(VariantGraph.Vertex v) {
        return escapeDotLabel(vertexToString.apply(v));
    }

    static String escapeDotLabel(String string) {
        return string.replaceAll("\"", "\\\\\"").replaceAll("[\n\r]+", "\u00B6");
    }

    VariantGraphRanking ranking() {
        if (ranking == null) {
            ranking = VariantGraphRanking.of(graph);
        }
        return ranking;
    }

    Set> transposedTuples() {
        final Set> tuples = new HashSet<>();
        final Comparator vertexOrdering = ranking().comparator();

        for (Set transposition : graph.transpositions()) {
            final SortedMap> verticesByWitness = new TreeMap<>(Witness.SIGIL_COMPARATOR);
            for (VariantGraph.Vertex vertex : transposition) {
                for (Witness witness : vertex.witnesses()) {
                    verticesByWitness.computeIfAbsent(witness, w -> new TreeSet<>(vertexOrdering)).add(vertex);
                }
            }

            Witness prev = null;
            for (Witness witness : verticesByWitness.keySet()) {
                if (prev != null) {
                    final Iterator prevIt = verticesByWitness.get(prev).iterator();
                    final Iterator nextIt = verticesByWitness.get(witness).iterator();
                    while (prevIt.hasNext() && nextIt.hasNext()) {
                        final VariantGraph.Vertex prevVertex = prevIt.next();
                        final VariantGraph.Vertex nextVertex = nextIt.next();
                        if (!prevVertex.equals(nextVertex)) {
                            tuples.add(new Tuple<>(prevVertex, nextVertex));
                        }
                    }
                }
                prev = witness;
            }
        }

        return tuples;
    }

    public void toGraphML(XMLStreamWriter xml) throws XMLStreamException {
        xml.writeStartElement("", GRAPHML_TAG, GRAPHML_NS);
        xml.writeNamespace("", GRAPHML_NS);
        xml.writeAttribute(XMLNSXSI_ATT, GRAPHML_XMLNSXSI);
        xml.writeAttribute(XSISL_ATT, GRAPHML_XSISL);

        for (GraphMLProperty p : GraphMLProperty.values()) {
            p.declare(xml);
        }

        xml.writeStartElement(GRAPHML_NS, GRAPH_TAG);
        xml.writeAttribute(ID_ATT, GRAPH_ID);
        xml.writeAttribute(EDGEDEFAULT_ATT, EDGEDEFAULT_DEFAULT_VALUE);
        xml.writeAttribute(PARSENODEIDS_ATT, PARSENODEIDS_DEFAULT_VALUE);
        xml.writeAttribute(PARSEEDGEIDS_ATT, PARSEEDGEIDS_DEFAULT_VALUE);
        xml.writeAttribute(PARSEORDER_ATT, PARSEORDER_DEFAULT_VALUE);

        final VariantGraphRanking ranking = ranking();
        for (VariantGraph.Vertex vertex : graph.vertices()) {
            final int id = numericId(vertex);
            xml.writeStartElement(GRAPHML_NS, NODE_TAG);
            xml.writeAttribute(ID_ATT, "n" + id);
            GraphMLProperty.NODE_NUMBER.write(Integer.toString(id), xml);
            GraphMLProperty.NODE_RANK.write(Integer.toString(ranking.apply(vertex)), xml);
            GraphMLProperty.NODE_TOKEN.write(vertexToString.apply(vertex), xml);
            xml.writeEndElement();
        }

        int edgeNumber = 0;
        for (VariantGraph.Vertex v : graph.vertices()) {
            for (Map.Entry> edge : v.outgoing().entrySet()) {
                xml.writeStartElement(GRAPHML_NS, EDGE_TAG);
                xml.writeAttribute(ID_ATT, "e" + edgeNumber);
                xml.writeAttribute(SOURCE_ATT, "n" + numericId(v));
                xml.writeAttribute(TARGET_ATT, "n" + numericId(edge.getKey()));
                GraphMLProperty.EDGE_NUMBER.write(Integer.toString(edgeNumber++), xml);
                GraphMLProperty.EDGE_TYPE.write(EDGE_TYPE_PATH, xml);
                GraphMLProperty.EDGE_WITNESSES.write(edge.getValue().stream().map(Witness::getSigil).distinct().sorted().collect(Collectors.joining(", ")), xml);
                xml.writeEndElement();
            }
        }

        for (Tuple transposedTuple : transposedTuples()) {
            xml.writeStartElement(GRAPHML_NS, EDGE_TAG);
            xml.writeAttribute(ID_ATT, "e" + edgeNumber);
            xml.writeAttribute(SOURCE_ATT, "n" + numericId(transposedTuple.left));
            xml.writeAttribute(TARGET_ATT, "n" + numericId(transposedTuple.right));
            GraphMLProperty.EDGE_NUMBER.write(Integer.toString(edgeNumber++), xml);
            GraphMLProperty.EDGE_TYPE.write(EDGE_TYPE_TRANSPOSITION, xml);
            xml.writeEndElement();
        }

        xml.writeEndElement();

        xml.writeEndElement();
    }

    private static final String NODE_TAG = "node";
    private static final String TARGET_ATT = "target";
    private static final String SOURCE_ATT = "source";
    private static final String EDGE_TAG = "edge";
    private static final String EDGE_TYPE_PATH = "path";
    private static final String EDGE_TYPE_TRANSPOSITION = "transposition";
    private static final String EDGEDEFAULT_DEFAULT_VALUE = "directed";
    private static final String EDGEDEFAULT_ATT = "edgedefault";
    private static final String GRAPH_ID = "g0";
    private static final String GRAPH_TAG = "graph";
    private static final String GRAPHML_NS = "http://graphml.graphdrawing.org/xmlns";
    private static final String GRAPHML_TAG = "graphml";
    private static final String XMLNSXSI_ATT = "xmlns:xsi";
    private static final String XSISL_ATT = "xsi:schemaLocation";
    private static final String GRAPHML_XMLNSXSI = "http://www.w3.org/2001/XMLSchema-instance";
    private static final String GRAPHML_XSISL = "http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd";
    private static final String PARSENODEIDS_ATT = "parse.nodeids";
    private static final String PARSENODEIDS_DEFAULT_VALUE = "canonical";
    private static final String PARSEEDGEIDS_ATT = "parse.edgeids";
    private static final String PARSEEDGEIDS_DEFAULT_VALUE = "canonical";
    private static final String PARSEORDER_ATT = "parse.order";
    private static final String PARSEORDER_DEFAULT_VALUE = "nodesfirst";

    private static final String ATTR_TYPE_ATT = "attr.type";
    private static final String ATTR_NAME_ATT = "attr.name";
    private static final String FOR_ATT = "for";
    private static final String ID_ATT = "id";
    private static final String KEY_TAG = "key";
    private static final String DATA_TAG = "data";

    private enum GraphMLProperty {
        NODE_NUMBER(NODE_TAG, "number", "int"), //
        NODE_TOKEN(NODE_TAG, "tokens", "string"), //
        NODE_RANK(NODE_TAG, "rank", "int"), //
        EDGE_NUMBER(EDGE_TAG, "number", "int"), //
        EDGE_TYPE(EDGE_TAG, "type", "string"), //
        EDGE_WITNESSES(EDGE_TAG, "witnesses", "string");

        private String name;
        private String forElement;
        private String type;

        private GraphMLProperty(String forElement, String name, String type) {
            this.name = name;
            this.forElement = forElement;
            this.type = type;
        }

        public void write(String data, XMLStreamWriter xml) throws XMLStreamException {
            xml.writeStartElement(GRAPHML_NS, DATA_TAG);
            xml.writeAttribute(KEY_TAG, "d" + ordinal());
            xml.writeCharacters(data);
            xml.writeEndElement();
        }

        public void declare(XMLStreamWriter xml) throws XMLStreamException {
            xml.writeEmptyElement(GRAPHML_NS, KEY_TAG);
            xml.writeAttribute(ID_ATT, "d" + ordinal());
            xml.writeAttribute(FOR_ATT, forElement);
            xml.writeAttribute(ATTR_NAME_ATT, name);
            xml.writeAttribute(ATTR_TYPE_ATT, type);
        }
    }

    final Function vertexToString = new Function() {
        @Override
        public String apply(VariantGraph.Vertex input) {
            return input.witnesses().stream().findFirst()
                .map(witness -> tokensToString.apply(Arrays.asList(input.tokens().stream().filter(t -> witness.equals(t.getWitness())).toArray(Token[]::new))))
                .orElse("");
        }
    };

    static final Function, String> SIMPLE_TOKEN_TO_STRING = input -> StreamSupport.stream(input.spliterator(), false)
        .filter(t -> SimpleToken.class.isAssignableFrom(t.getClass()))
        .map(t -> (SimpleToken) t)
        .sorted()
        .map(SimpleToken::getContent)
        .collect(Collectors.joining());
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy