All Downloads are FREE. Search and download functionalities are using the official Maven repository.

nl.vpro.domain.subtitles.EBU Maven / Gradle / Ivy

Go to download

Subtitles are related to media, but are implemented completely parallel. Classes to contain, parse, and assemble subtitle objects are here.

The newest version!
package nl.vpro.domain.subtitles;

import lombok.extern.slf4j.Slf4j;

import java.io.IOException;
import java.io.InputStream;
import java.time.Duration;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.function.Function;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

/**
 * Code borrowed from JDaren@github
 * 

* Adapted so it returns a stream of our {@link Cue} objects * @author Michiel Meeuwissen * @since 5.3 */ @Slf4j public class EBU { private EBU() { } public static Stream parse(String parent, Duration offset, Function offsetGuesser, InputStream is) { return StreamSupport.stream(Spliterators.spliteratorUnknownSize(parseToIterator(parent, offset, offsetGuesser, is), Spliterator.ORDERED), false); } protected static Iterator parseToIterator(final String parent, final Duration offsetArgument, Function offsetGuesser, final InputStream is) { if (is == null) { throw new IllegalArgumentException("Inputstream cannot be null"); } return new Iterator() { private Cue next; private Boolean hasNext = null; private int i = 0; final byte[] ttiBlock = new byte[128]; final int numberOfTTIBlocks; final int fps; final int numberOfSubtitles; int subtitleNumber = 1; Duration offset = offsetArgument; { byte[] gsiBlock = new byte[1024]; try { //we read the file //but first we create the possible styles int bytesRead = is.read(gsiBlock); //the GSI block is loaded if (bytesRead < 1024) { //the file must contain at least a GSI block and a TTI block //this is a fatal parsing error. throw new IllegalArgumentException("The file must contain at least a GSI block, but read only " + bytesRead + " (< 1024)"); } //CPC : code page number 0..2 //DFC : disk format code 3..10 //save the number of frames per second byte[] dfc = {gsiBlock[6], gsiBlock[7]}; fps = Integer.parseInt(new String(dfc)); //DSC : Display Standard Code 11 //CCT : Character Code Table number 12..13 byte[] cct = {gsiBlock[12], gsiBlock[13]}; int table = Integer.parseInt(new String(cct)); //LC : Language Code 14..15 //OPT : Original Programme Title 16..47 byte[] opt = new byte[32]; System.arraycopy(gsiBlock, 16, opt, 0, 32); String title = new String(opt); //OEP : Original Episode Title 48..79 byte[] oet = new byte[32]; System.arraycopy(gsiBlock, 48, oet, 0, 32); String episodeTitle = new String(oet); //TPT : Translated Programme Title 80..111 //TEP : Translated Episode Title 112..143 //TN : Translator's Name 144..175 //TCD : Translators Contact Details 176..207 //SLR : Subtitle List Reference code 208..223 //CD : Creation Date 224..229 //RD : Revision Date 230..235 //RN : Revision Number 236..237 //TNB : Total Number of TTI Blocks 238..242 byte[] tnb = {gsiBlock[238], gsiBlock[239], gsiBlock[240], gsiBlock[241], gsiBlock[242]}; numberOfTTIBlocks = Integer.parseInt(new String(tnb).trim()); //TNS : Total Number of Subtitles 243..247 byte[] tns = {gsiBlock[243], gsiBlock[244], gsiBlock[245], gsiBlock[246], gsiBlock[247]}; numberOfSubtitles = Integer.parseInt(new String(tns).trim()); //TNG : Total Number of Subtitle Groups 248..250 //MNC : Max Number of characters in row 251..252 //MNR : Max number of rows 253..254 //TCS : Time Code: Status 255 //TCP : Time Code: Start-of-Programme 256..263 //TCF : Time Code: First In-Cue 264..271 //TND : Total Number of Disks 272 //DSN : Disk Sequence Number 273 //CO : Country of Origin 274..276 //PUB : Publisher 277..308 //EN : Editor's Name 309..340 //ECD : Editor's Contact Details 341..372 // Spare bytes 373..447 //UDA : User-Defined Area 448..1023 //we add the title //this checks the reference to the characters coding employed. if (table > 4 || table < 0) { log.error("Invalid Character Code table number, corrupt data? will try to parse anyways assuming it is latin."); } else if (table != 0) { log.error("Only latin alphabet supported for import from STL, other languages may produce unexpected results."); } int subtitleNumber = 1; } catch (IOException e) { log.error(e.getMessage(), e); throw new RuntimeException(e); } } @Override public boolean hasNext() { findNext(); return hasNext; } @Override public Cue next() { findNext(); if (!hasNext()) { throw new NoSuchElementException(); } hasNext = null; return next; } private void findNext() { if (hasNext == null) { next = null; try { Cue toFill = new Cue(); toFill.parent = parent; boolean additionalText = false; StringBuilder currentContent = new StringBuilder(); //the TTI blocks are read while (i < numberOfTTIBlocks && next == null) { i++; //the TTI block is loaded int bytesRead = is.read(ttiBlock); if (bytesRead < 128) { //unexpected end of file log.error("Unexpected end of file, {} blocks read, expecting {} blocks in total.", i, numberOfTTIBlocks); break; } //SGN : Subtitle group number 0 //SN : Subtitle Number 1..2 int currentSubNumber = Math.abs(ttiBlock[1] + 256 * ttiBlock[2]); if (currentSubNumber != subtitleNumber) { //missing subtitle number? log.debug("Unexpected subtitle number {} (expected {}) at TTI block {} Parsing proceeds...", subtitleNumber, currentSubNumber, i); } toFill.sequence = subtitleNumber; toFill.identifier = "" + subtitleNumber; //EBN : Extension Block Number 3 int ebn = ttiBlock[3]; if (ebn != -1 && ebn != -2) { additionalText = true; } else if (ebn == -2) { //EBN is UserData so Jump it. additionalText = false; continue; } else { additionalText = false; } //CS : Cumulative Status 4 //TCI : Time Code In 5..8 String startTime = "" + ttiBlock[5] + ":" + ttiBlock[6] + ":" + ttiBlock[7] + ":" + ttiBlock[8]; //TCO : Time Code Out 9..12 String endTime = "" + ttiBlock[9] + ":" + ttiBlock[10] + ":" + ttiBlock[11] + ":" + ttiBlock[12]; //VP : Vertical Position 13 //JC : Justification Code 14 int justification = ttiBlock[14]; //0:none, 1:left, 2:centered, 3:right //CF : Comment Flag 15 if (ttiBlock[15] == 0) { //comments are ignored //TF : Text Field 16..112 byte[] textField = new byte[112]; System.arraycopy(ttiBlock, 16, textField, 0, 112); if (additionalText) //if it is just additional text for the caption parseTextForSTL(currentContent, textField, justification); else { TimeLine timeLine = parseTime(startTime, endTime, fps); if (offset == null) { offset = offsetGuesser.apply(timeLine); } toFill.start = parseTime(startTime + "/" + fps).minus(offset); toFill.end = parseTime(endTime + "/" + fps).minus(offset); parseTextForSTL(currentContent, textField, justification); } } //we increase the subtitle number if (!additionalText) { subtitleNumber++; next = toFill; break; } } if (next != null) { hasNext = true; next.content = currentContent.toString().trim(); } else { hasNext = false; //we close the reader is.close(); if (subtitleNumber != numberOfSubtitles) { log.debug("Number of parsed subtitles ({}) different from expected number of subtitles ({})", subtitleNumber, numberOfSubtitles); } } } catch (Exception e) { hasNext = false; //format error throw new IllegalArgumentException("Format error in the file, might be due to corrupt data." + e.getMessage(), e); } } } }; } private static TimeLine parseTime(String startTime, String endTime, int fps) { return new TimeLine(null, parseTime(startTime + "/" + fps), parseTime(endTime + "/" + fps)); } /** * This method parses the text field taking into account STL control codes */ private static void parseTextForSTL(StringBuilder currentCaption, byte[] textField, int justification) { boolean italics = false; boolean underline = false; int diacritical_mark = 0; String color = "white"; String text = ""; //we go around the field in pair of bytes to decode them for (int i = 0; i < textField.length; i++) { if (textField[i] < 0) { //first byte > 8 (4 bits) if (textField[i] <= -113) { //we might be with a control code if (i + 1 < textField.length && textField[i] == textField[i + 1]) i++; //if repeated skip one switch (textField[i]) { case -128: italics = true; break; case -127: italics = false; break; case -126: underline = true; break; case -125: underline = false; break; case -124: //Boxing not supported break; case -123: //Boxing not supported break; case -118: //line break currentCaption.append(text.trim()).append("\n"); text = ""; break; case -113: //end of caption currentCaption.append(text.trim()); text = ""; //we check the style if (underline) color += "U"; if (italics) color += "I"; //style = tto.styling.get(color); if (justification == 1) { color += "L"; } else if (justification == 3) { color += "R"; } //we end the loop i = textField.length; break; default: //non valid code... } } else if (textField[i] >= -64 && textField[i] <= -49) { // diacritical characters diacritical_mark = textField[i]; } else { //other codes and non supported characters... //corresponds to the upper half of the character code table } } else if (textField[i] < 32) { //it is a teletext control code, only colors are supported if (i + 1 < textField.length && textField[i] == textField[i + 1]) i++; //if repeated skip one switch (textField[i]) { case 7: color = "white"; break; case 2: color = "green"; break; case 4: color = "blue"; break; case 6: color = "cyan"; break; case 1: color = "red"; break; case 3: color = "yellow"; break; case 5: color = "magenta"; break; case 0: color = "black"; break; default: //non supported } } else { //we have a supported character coded in the two bytes, range is from 0x20 to 0x7F byte[] x = {textField[i]}; String raw_string = new String(x); if (diacritical_mark != 0) { if ((diacritical_mark == -62) && (textField[i] == 101)) raw_string = "é"; else if ((diacritical_mark == -56) && (textField[i] == 105)) raw_string = "ï"; else if ((diacritical_mark == -63) && (textField[i] == 97)) raw_string = "à"; else if ((diacritical_mark == -56) && (textField[i] == 101)) raw_string = "ë"; else if ((diacritical_mark == -61) && (textField[i] == 101)) raw_string = "ê"; else if ((diacritical_mark == -63) && (textField[i] == 117)) raw_string = "ù"; else if ((diacritical_mark == -61) && (textField[i] == 105)) raw_string = "î"; else if ((diacritical_mark == -63) && (textField[i] == 101)) raw_string = "è"; else if ((diacritical_mark == -61) && (textField[i] == 97)) raw_string = "â"; else if ((diacritical_mark == -61) && (textField[i] == 111)) raw_string = "ô"; else if ((diacritical_mark == -61) && (textField[i] == 117)) raw_string = "û"; else if ((diacritical_mark == -53) && (textField[i] == 99)) raw_string = "ç"; else if ((diacritical_mark == -56) && (textField[i] == 97)) raw_string = "ä"; else if ((diacritical_mark == -56) && (textField[i] == 111)) raw_string = "ö"; else if ((diacritical_mark == -56) && (textField[i] == 117)) raw_string = "ü"; diacritical_mark = 0; } text += raw_string; } } } /** * @param value string in the correct format */ private static Duration parseTime(String value) { int h, m, s, f; String[] args = value.split("/"); float fps = Float.parseFloat(args[1]); args = args[0].split(":"); h = Integer.parseInt(args[0]); m = Integer.parseInt(args[1]); s = Integer.parseInt(args[2]); f = Integer.parseInt(args[3]); return Duration.ofMillis((long) (f * 1000L / fps) + s * 1000L + m * 60000L + h * 3600000L); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy