All downloads are free. Search and download functionality uses the official Maven repository.

mServer.crawler.sender.orf.tasks.OrfFilmDetailTask Maven / Gradle / Ivy

There is a newer version: 3.1.64
Show newest version
package mServer.crawler.sender.orf.tasks;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.reflect.TypeToken;
import de.mediathekview.mlib.daten.DatenFilm;
import de.mediathekview.mlib.tool.Log;
import java.lang.reflect.Type;
import java.net.MalformedURLException;
import java.time.Duration;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.time.temporal.ChronoUnit;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.ConcurrentLinkedQueue;
import mServer.crawler.CrawlerTool;
import mServer.crawler.sender.MediathekReader;
import mServer.crawler.sender.newsearch.Qualities;
import mServer.crawler.sender.orf.HtmlDocumentUtils;
import mServer.crawler.sender.orf.OrfEpisodeInfoDTO;
import mServer.crawler.sender.orf.OrfVideoInfoDTO;
import mServer.crawler.sender.orf.TopicUrlDTO;
import mServer.crawler.sender.orf.parser.OrfEpisodeDeserializer;
import mServer.crawler.sender.orf.parser.OrfVideoDetailDeserializer;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

public class OrfFilmDetailTask extends AbstractDocumentTask {

  private static final String TITLE_SELECTOR = "h3.video_headline";
  private static final String BROADCAST_SELECTOR = "div.broadcast_information";
  private static final String TIME_SELECTOR = BROADCAST_SELECTOR + " > time";
  private static final String DURATION_SELECTOR = BROADCAST_SELECTOR + " > span.meta_duration";
  private static final String DESCRIPTION_SELECTOR = "div.details_description";
  private static final String VIDEO_SELECTOR = "div.jsb_VideoPlaylist";
  private static final String EPISODE_SELECTOR = "li.jsb_PlaylistItemFullscreen";

  private static final String ATTRIBUTE_DATETIME = "datetime";
  private static final String ATTRIBUTE_DATA_JSB = "data-jsb";

  private static final DateTimeFormatter DATE_TIME_FORMATTER
          = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

  private static final DateTimeFormatter DATE_FORMAT
          = DateTimeFormatter.ofPattern("dd.MM.yyyy");
  private static final DateTimeFormatter TIME_FORMAT
          = DateTimeFormatter.ofPattern("HH:mm:ss");

  private static final Type OPTIONAL_VIDEOINFO_TYPE_TOKEN = new TypeToken>() {
  }.getType();
  private static final Type OPTIONAL_EPISODEINFO_TYPE_TOKEN = new TypeToken>() {
  }.getType();

  public OrfFilmDetailTask(final MediathekReader aCrawler,
          final ConcurrentLinkedQueue aUrlToCrawlDTOs) {
    super(aCrawler, aUrlToCrawlDTOs);
  }

  @Override
  protected void processDocument(TopicUrlDTO aUrlDTO, Document aDocument) {
    final Optional title = HtmlDocumentUtils.getElementString(TITLE_SELECTOR, aDocument);
    final Optional time = parseDate(aDocument);
    final Optional duration = parseDuration(aDocument);
    final Optional description = HtmlDocumentUtils.getElementString(DESCRIPTION_SELECTOR, aDocument);

    final Optional videoInfoOptional = parseUrls(aDocument);

    createFilm(aUrlDTO, videoInfoOptional, title, description, time, duration);

    final List episodes = parseEpisodes(aDocument);
    episodes.forEach(episode -> {
      createFilm(aUrlDTO, Optional.of(episode.getVideoInfo()), episode.getTitle(), episode.getDescription(), time, episode.getDuration());
    });
  }

  @Override
  protected AbstractUrlTask createNewOwnInstance(ConcurrentLinkedQueue aURLsToCrawl) {
    return new OrfFilmDetailTask(crawler, aURLsToCrawl);
  }

  private void createFilm(final TopicUrlDTO aUrlDTO,
          final Optional aVideoInfo,
          final Optional aTitle,
          final Optional aDescription,
          final Optional aTime,
          final Optional aDuration) {

    try {
      if (aVideoInfo.isPresent() && aTitle.isPresent()) {
        OrfVideoInfoDTO videoInfo = aVideoInfo.get();

        LocalDateTime time = aTime.orElse(LocalDateTime.now());

        String datum = time.format(DATE_FORMAT);
        String zeit = time.format(TIME_FORMAT);
        String url = videoInfo.getDefaultVideoUrl();

        final DatenFilm film = new DatenFilm(crawler.getSendername(),
                aUrlDTO.getTopic(),
                aUrlDTO.getUrl(),
                aTitle.get(),
                url,
                "",
                datum,
                zeit,
                aDuration.orElse(Duration.ZERO).getSeconds(),
                aDescription.orElse(""));

        if (StringUtils.isNotBlank(videoInfo.getSubtitleUrl())) {
          CrawlerTool.addUrlSubtitle(film, videoInfo.getSubtitleUrl());
        }

        addUrls(film, videoInfo.getVideoUrls());

        taskResults.add(film);
      } else {
        Log.sysLog("OrfFilmDetailTask: no title or video found for url " + aUrlDTO.getUrl());
      }
    } catch (MalformedURLException ex) {
      Log.errorLog(984514561, ex);
    }
  }

  private void addUrls(final DatenFilm aFilm, final Map aVideoUrls)
          throws MalformedURLException {

    if (aVideoUrls.containsKey(Qualities.HD)) {
      CrawlerTool.addUrlHd(aFilm, aVideoUrls.get(Qualities.HD), "");
    }
    if (aVideoUrls.containsKey(Qualities.SMALL)) {
      CrawlerTool.addUrlKlein(aFilm, aVideoUrls.get(Qualities.SMALL), "");
    }
  }

  private Optional parseUrls(Document aDocument) {
    Optional json = HtmlDocumentUtils.getElementAttributeString(VIDEO_SELECTOR, ATTRIBUTE_DATA_JSB, aDocument);

    if (json.isPresent()) {

      final Gson gson = new GsonBuilder().registerTypeAdapter(OPTIONAL_VIDEOINFO_TYPE_TOKEN,
              new OrfVideoDetailDeserializer()).create();

      return gson.fromJson(json.get(), OPTIONAL_VIDEOINFO_TYPE_TOKEN);
    }

    return Optional.empty();
  }

  private static Optional parseDate(Document aDocument) {
    Optional date = HtmlDocumentUtils.getElementAttributeString(TIME_SELECTOR, ATTRIBUTE_DATETIME, aDocument);
    if (date.isPresent()) {
      String dateValue = date.get().replace("CET", " ").replace("CEST", " ");
      try {
        LocalDateTime localDate = LocalDateTime.parse(dateValue, DATE_TIME_FORMATTER);
        return Optional.of(localDate);
      } catch (DateTimeParseException e) {
        Log.sysLog("OrfFilmDetailTask: unknown date format: " + date.get());
      }
    }

    return Optional.empty();
  }

  private static Optional parseDuration(Document aDocument) {
    Optional duration = HtmlDocumentUtils.getElementString(DURATION_SELECTOR, aDocument);
    if (!duration.isPresent()) {
      return Optional.empty();
    }

    Optional unit = determineChronoUnit(duration.get());
    if (!unit.isPresent()) {
      Log.sysLog("OrfFilmDetailTask: unknown duration type: " + duration.get());
      return Optional.empty();
    }

    String[] parts = duration.get().split(" ")[0].trim().split(":");
    if (parts.length != 2) {
      Log.sysLog("OrfFilmDetailTask: unknown duration part count: " + duration.get());
      return Optional.empty();
    }

    ChronoUnit unitValue = unit.get();
    if (unitValue == ChronoUnit.MINUTES) {
      return Optional.of(
              Duration.ofMinutes(Long.parseLong(parts[0]))
                      .plusSeconds(Long.parseLong(parts[1]))
      );
    }
    if (unitValue == ChronoUnit.HOURS) {
      return Optional.of(
              Duration.ofHours(Long.parseLong(parts[0]))
                      .plusMinutes(Long.parseLong(parts[1]))
      );
    }

    return Optional.empty();
  }

  private static List parseEpisodes(final Document aDocument) {
    final List episodes = new ArrayList<>();

    Elements elements = aDocument.select(EPISODE_SELECTOR);
    elements.forEach(element -> {
      String json = element.attr(ATTRIBUTE_DATA_JSB);

      if (!json.isEmpty()) {

        final Gson gson = new GsonBuilder().registerTypeAdapter(OPTIONAL_EPISODEINFO_TYPE_TOKEN,
                new OrfEpisodeDeserializer()).create();

        Optional episode = gson.fromJson(json, OPTIONAL_EPISODEINFO_TYPE_TOKEN);
        if (episode.isPresent()) {
          episodes.add(episode.get());
        }
      }
    });

    return episodes;
  }

  private static Optional determineChronoUnit(String aDuration) {
    if (aDuration.contains("Min.")) {
      return Optional.of(ChronoUnit.MINUTES);
    }
    if (aDuration.contains("Std.")) {
      return Optional.of(ChronoUnit.HOURS);
    }

    return Optional.empty();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy