All downloads are free. The search and download functions use the official Maven repository.

de.mediathekview.mlib.tool.TimedTextMarkupLanguageParser Maven / Gradle / Ivy

Go to download

A central library with tools and utils for the MediathekView Client and the MediathekView Server

The newest version!
/*
 *    TimedTextMarkupLanguageParser
 *    Copyright (C) 2016 CrystalPalace
 *    [email protected]
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.mediathekview.mlib.tool;

import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.nio.file.Path;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Converter for TTML XML subtitle files into SubRip Text (SRT) format.
 * Tested with MediathekView downloaded subtitles and TTML format version 1.0.
 *
 * <p>Usage: call {@link #parse(Path)} (EBU-TT style files using the {@code tt:} prefix) or
 * {@link #parseXmlFlash(Path)} (un-prefixed "Flash" style files), then {@link #toSrt(Path)},
 * and finally {@link #cleanup()} before the instance is reused for another file.
 *
 * <p>Instances are not thread-safe: they keep mutable parse state and use
 * {@link SimpleDateFormat}, which must not be shared between threads.
 */
public class TimedTextMarkupLanguageParser {

    /** Text color used when a subtitle does not define one. */
    private static final String DEFAULT_COLOR = "#FFFFFF";

    /** Timestamp format of the {@code begin}/{@code end} attributes in TTML files. */
    private final SimpleDateFormat ttmlFormat = new SimpleDateFormat("HH:mm:ss.SS");
    /** SRT timestamp format; SRT requires exactly three millisecond digits, hence {@code SSS}. */
    private final SimpleDateFormat srtFormat = new SimpleDateFormat("HH:mm:ss,SSS");
    /** Format for Flash timestamps given as plain seconds (see {@link #parseFlash(String)}). */
    private final SimpleDateFormat sdfFlash = new SimpleDateFormat("s.S");

    /** Maps a style id ({@code xml:id}) to its {@code tts:color} value. */
    private final Map<String, String> colorMap = new Hashtable<>();
    /** Subtitles collected by the last parse call(s), in document order. */
    private final List<Subtitle> subtitleList = new ArrayList<>();
    /** Fallback color for text without a resolvable color. */
    private String color = DEFAULT_COLOR;
    private Document doc = null;

    public TimedTextMarkupLanguageParser() {
    }

    /**
     * Returns the named attribute of a node.
     *
     * @param node the node to inspect
     * @param name the qualified attribute name
     * @return the attribute node, or {@code null} if the node has no such attribute
     */
    private static Node attribute(Node node, String name) {
        final NamedNodeMap attributes = node.getAttributes();
        return attributes == null ? null : attributes.getNamedItem(name);
    }

    /**
     * Switches off a parser feature; unsupported features are ignored so that parsing
     * still works with parser implementations that do not know the feature.
     */
    private static void disableFeature(DocumentBuilderFactory factory, String feature) {
        try {
            factory.setFeature(feature, false);
        } catch (javax.xml.parsers.ParserConfigurationException ignored) {
            // best effort: this parser implementation does not support the feature
        }
    }

    /**
     * Reads an XML file into a DOM document.
     *
     * @param xmlFile the file to read
     * @return the parsed document
     * @throws Exception if the parser cannot be created or the file cannot be read/parsed
     */
    private static Document loadDocument(Path xmlFile) throws Exception {
        final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setNamespaceAware(true);
        // Subtitle files are downloaded content: never resolve external entities or DTDs (XXE).
        disableFeature(dbf, "http://xml.org/sax/features/external-general-entities");
        disableFeature(dbf, "http://xml.org/sax/features/external-parameter-entities");
        disableFeature(dbf, "http://apache.org/xml/features/nonvalidating/load-external-dtd");

        final DocumentBuilder db = dbf.newDocumentBuilder();
        return db.parse(xmlFile.toFile());
    }

    /**
     * Build a map of used colors within the TTML file.
     */
    private void buildColorMap() {
        final NodeList styleData = doc.getElementsByTagName("tt:style");
        for (int i = 0; i < styleData.getLength(); i++) {
            final Node styleNode = styleData.item(i);
            final Node idNode = attribute(styleNode, "xml:id");
            final Node colorNode = attribute(styleNode, "tts:color");
            if (idNode != null && colorNode != null) {
                colorMap.put(idNode.getNodeValue(), colorNode.getNodeValue());
            }
        }
    }

    /**
     * Parses a TTML timestamp and applies the 10 hour correction.
     *
     * @param timestamp value of a {@code begin} or {@code end} attribute
     * @return the corrected time
     * @throws ParseException if the value does not match the TTML timestamp format
     */
    @SuppressWarnings("deprecation")
    private Date parseTtmlTimestamp(String timestamp) throws ParseException {
        final Date time = ttmlFormat.parse(timestamp);
        //HACK:: Don't know why this is set like this...
        //but we have to subtract 10 hours from the XML
        if (time.getHours() >= 10) {
            time.setHours(time.getHours() - 10);
        }
        return time;
    }

    /**
     * Build the Subtitle objects from TTML content ({@code tt:p} elements with
     * {@code tt:span} children).
     *
     * @throws ParseException if a timestamp cannot be parsed
     */
    private void buildFilmList() throws ParseException {
        final NodeList subtitleData = doc.getElementsByTagName("tt:p");

        for (int i = 0; i < subtitleData.getLength(); i++) {
            final Node paragraph = subtitleData.item(i);
            final Node beginNode = attribute(paragraph, "begin");
            final Node endNode = attribute(paragraph, "end");
            if (beginNode == null || endNode == null) {
                // Without timing the paragraph cannot be written as an SRT cue.
                continue;
            }

            final Subtitle subtitle = new Subtitle();
            subtitle.begin = parseTtmlTimestamp(beginNode.getNodeValue());
            subtitle.end = parseTtmlTimestamp(endNode.getNodeValue());

            final NodeList childNodes = paragraph.getChildNodes();
            for (int j = 0; j < childNodes.getLength(); j++) {
                final Node node = childNodes.item(j);
                if (node.getNodeName().equalsIgnoreCase("tt:span")) {
                    //retrieve the text and color information...
                    final Node styleNode = attribute(node, "style");
                    final String mappedColor =
                            styleNode == null ? null : colorMap.get(styleNode.getNodeValue());

                    final StyledString textContent = new StyledString();
                    textContent.setText(node.getTextContent());
                    // Missing or unknown style ids fall back to the default color
                    // (original note: unknown ids occurred in files from BR).
                    textContent.setColor(mappedColor == null ? color : mappedColor);
                    subtitle.listOfStrings.add(textContent);
                }
            }
            subtitleList.add(subtitle);
        }
    }

    /**
     * Parses a Flash subtitle timestamp, given either as clock time ({@code HH:mm:ss.SS})
     * or as plain seconds with a fraction.
     *
     * @param tStamp the timestamp text
     * @return the parsed time
     * @throws ParseException if the text matches neither format
     */
    private Date parseFlash(String tStamp) throws ParseException {
        if (tStamp.contains(":")) {
            return ttmlFormat.parse(tStamp);
        }
        // "00" is appended so that the fraction is read as milliseconds.
        return sdfFlash.parse(tStamp + "00");
    }

    /**
     * Build the Subtitle objects from Flash style content (un-prefixed {@code p} elements).
     *
     * @throws ParseException if a timestamp cannot be parsed
     */
    private void buildFilmListFlash() throws ParseException {
        final NodeList subtitleData = doc.getElementsByTagName("p");

        for (int i = 0; i < subtitleData.getLength(); i++) {
            final Node paragraph = subtitleData.item(i);
            final Node beginNode = attribute(paragraph, "begin");
            final Node endNode = attribute(paragraph, "end");
            if (beginNode == null || endNode == null) {
                // Without timing the paragraph cannot be written as an SRT cue.
                continue;
            }

            final Subtitle subtitle = new Subtitle();
            subtitle.begin = parseFlash(beginNode.getNodeValue());
            subtitle.end = parseFlash(endNode.getNodeValue());

            final StyledString textContent = new StyledString();
            textContent.setColor(color); // better safe than sorry: start with the fallback
            textContent.setText(paragraph.getTextContent());

            final Node paragraphColor = attribute(paragraph, "tts:color");
            if (paragraphColor != null) {
                textContent.setColor(paragraphColor.getNodeValue());
            } else {
                // No color on the paragraph: use the color of its spans (the last one wins).
                final NodeList childNodes = paragraph.getChildNodes();
                for (int j = 0; j < childNodes.getLength(); j++) {
                    final Node node = childNodes.item(j);
                    if (node.getNodeName().equalsIgnoreCase("span")) {
                        final Node spanColor = attribute(node, "tts:color");
                        if (spanColor != null) {
                            textContent.setColor(spanColor.getNodeValue());
                        }
                    }
                }
            }
            subtitle.listOfStrings.add(textContent);
            subtitleList.add(subtitle);
        }
    }

    /**
     * Parse the TTML file into internal representation.
     * Failures are logged and reported through the return value.
     *
     * @param ttmlFilePath the TTML file to parse
     * @return true if the parsing was successful
     */
    public boolean parse(Path ttmlFilePath) {
        try {
            doc = loadDocument(ttmlFilePath);

            //Check that we have TTML v1.0 file as we have tested only them...
            final Node versionNode = doc.getElementsByTagName("ebuttm:documentEbuttVersion").item(0);
            if (versionNode == null || !versionNode.getTextContent().equalsIgnoreCase("v1.0")) {
                throw new Exception("Unknown TTML file version");
            }

            buildColorMap();
            buildFilmList();
            return true;
        } catch (Exception ex) {
            Log.errorLog(912036478, new String[]{ex.getLocalizedMessage(), "File: " + ttmlFilePath});
            return false;
        }
    }

    /**
     * Parse the XML Subtitle File for Flash Player into internal representation.
     * Failures are logged and reported through the return value.
     *
     * @param ttmlFilePath the TTML file to parse
     * @return true if the parsing was successful
     */
    public boolean parseXmlFlash(Path ttmlFilePath) {
        try {
            doc = loadDocument(ttmlFilePath);

            // The root element must exist and, if it declares a default namespace,
            // it has to be one of the known TTML namespaces.
            final Node root = doc.getElementsByTagName("tt").item(0);
            if (root == null || !root.hasAttributes()) {
                throw new Exception("Unknown File Format");
            }
            final Node xmlns = root.getAttributes().getNamedItem("xmlns");
            if (xmlns != null) {
                final String namespace = xmlns.getNodeValue();
                if (!namespace.equals("http://www.w3.org/2006/04/ttaf1")
                        && !namespace.equals("http://www.w3.org/ns/ttml")) {
                    throw new Exception("Unknown TTML file version");
                }
            }

            // The first style element, if any, provides the fallback text color.
            final NodeList styles = doc.getElementsByTagName("style");
            if (styles.getLength() == 0) {
                this.color = DEFAULT_COLOR;
            } else {
                final Node style = styles.item(0);
                if (!style.hasAttributes()) {
                    throw new Exception("Unknown File Format");
                }
                final Node styleColor = style.getAttributes().getNamedItem("tts:color");
                if (styleColor != null && !styleColor.getNodeValue().isEmpty()) {
                    this.color = styleColor.getNodeValue();
                }
            }

            buildFilmListFlash();
            return true;
        } catch (Exception ex) {
            Log.errorLog(46231470, new String[]{ex.getLocalizedMessage(), "File: " + ttmlFilePath});
            return false;
        }
    }

    /**
     * Convert internal representation into SubRip Text Format and save to file.
     * The file is written as UTF-8; failures are logged.
     *
     * @param srtFile The path of the srt file to write
     */
    public void toSrt(Path srtFile) {
        try (FileOutputStream fos = new FileOutputStream(srtFile.toFile());
             OutputStreamWriter osw = new OutputStreamWriter(fos, Charset.forName("UTF-8"));
             PrintWriter writer = new PrintWriter(osw)) {
            long counter = 1;
            for (Subtitle title : subtitleList) {
                writer.println(counter);
                writer.println(srtFormat.format(title.begin) + " --> " + srtFormat.format(title.end));
                for (StyledString entry : title.listOfStrings) {
                    final boolean colored = !entry.getColor().isEmpty();
                    if (colored) {
                        writer.print("<font color=\"" + entry.getColor() + "\">");
                    }
                    writer.print(entry.getText());
                    if (colored) {
                        writer.print("</font>");
                    }
                    writer.println();
                }
                writer.println("");
                counter++;
            }
            // PrintWriter never throws on I/O problems, so ask it explicitly.
            if (writer.checkError()) {
                throw new java.io.IOException("Error while writing the srt file");
            }
        } catch (Exception ex) {
            Log.errorLog(201036470, ex, "File: " + srtFile);
        }
    }

    /**
     * Discards the colors and subtitles collected by previous parse calls.
     */
    public void cleanup() {
        colorMap.clear();
        subtitleList.clear();
    }

    /** A piece of subtitle text together with its display color. */
    private static final class StyledString {

        private String text = "";
        private String color = "";

        public String getText() {
            return text;
        }

        public void setText(String text) {
            this.text = text;
        }

        public String getColor() {
            return color;
        }

        public void setColor(String color) {
            this.color = color;
        }
    }

    /** One subtitle cue: display interval plus its styled text lines. */
    private static final class Subtitle {

        public Date begin;
        public Date end;
        public final List<StyledString> listOfStrings = new ArrayList<>();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy