All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.interedition.collatex.simple.SimpleWitnessTeiBuilder Maven / Gradle / Ivy

/*
 * Copyright (c) 2015 The Interedition Development Group.
 *
 * This file is part of CollateX.
 *
 * CollateX is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CollateX is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CollateX.  If not, see <http://www.gnu.org/licenses/>.
 */

package eu.interedition.collatex.simple;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

// we are going to use stax
// previous version was build on DOM, which is not particularly well suited to parsing
// large TEI files, with segments and expansions
public class SimpleWitnessTeiBuilder {

    private static QName w = new QName("http://www.tei-c.org/ns/1.0", "w");
    private static QName seg = new QName("http://www.tei-c.org/ns/1.0", "seg");
    private static QName p = new QName("http://www.tei-c.org/ns/1.0", "p");

    public SimpleWitness read(InputStream input) throws XMLStreamException {
        SimpleWitness witness = new SimpleWitness("id");
        List tokenContents = new ArrayList<>();
        XMLInputFactory factory = XMLInputFactory.newInstance();
        XMLEventReader reader = factory.createXMLEventReader(input);
        XMLEvent event = null;
        int numberOfParagraphs = 0;
        while ((event = reader.peek()) != null && numberOfParagraphs < 10) {
            // System.out.println(event.toString());
            if (event.isStartElement() && event.asStartElement().getName().equals(w)) {
                tokenContents.add(handleWElement(reader));
            } else if (event.isStartElement() && event.asStartElement().getName().equals(seg)) {
                tokenContents.add(handleSegElement(reader));
            } else if (event.isStartElement() && event.asStartElement().getName().equals(p)) {
                reader.next();
                numberOfParagraphs++;
            } else {
                reader.next();
            }
        }
        witness.setTokenContents(tokenContents.stream(), SimpleTokenNormalizers.LC_TRIM_WS_PUNCT);
        return witness;
    }

    private static String handleWElement(XMLEventReader reader) throws XMLStreamException {
        XMLEvent event = reader.nextEvent();
        // Do what you need to do with the start element, e.g. initialize
        // data structures
        // System.out.println("W tag is triggered here!");
        StringBuffer textBuffer = new StringBuffer();
        while ((event = reader.peek()) != null) {
            if (event.isEndElement() && event.asEndElement().getName().equals(w)) {
                // Do what you need to do at the end, e.g. add data
                // collected from sub elements, etc.
                event = reader.nextEvent();
                break;
            } else {
                // Do what you need to do for start or child elements, e.g.
                // dispatch to another handler function
                event = reader.nextEvent();
                textBuffer.append(event.toString());
                // System.out.println("Text :"+event.toString());
            }
        }
        return textBuffer.toString();
    }

    private static String handleSegElement(XMLEventReader reader) throws XMLStreamException {
        XMLEvent event = reader.nextEvent();
        // Do what you need to do with the start element, e.g. initialize
        // data structures
        // System.out.println("Seg tag is triggered here!");
        StringBuffer textBuffer = new StringBuffer();
        while ((event = reader.peek()) != null) {
            if (event.isEndElement() && event.asEndElement().getName().equals(seg)) {
                // Do what you need to do at the end, e.g. add data
                // collected from sub elements, etc.
                event = reader.nextEvent();
                break;
            } else {
                // Do what you need to do for start or child elements, e.g.
                // dispatch to another handler function
                event = reader.nextEvent();
                if (event.getEventType() == XMLEvent.CHARACTERS) {
                    textBuffer.append(event.toString().trim());
                }
            }
        }
        return textBuffer.toString();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy