org.tinymediamanager.scraper.util.ParserUtils Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of tinyMediaManager Show documentation
The newest version!
/*
 * Copyright 2012 - 2019 Manuel Laggner
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tinymediamanager.scraper.util;

import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.WordUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tinymediamanager.core.Utils;
import org.tinymediamanager.core.movie.MovieModuleManager;
import org.tinymediamanager.core.tvshow.TvShowModuleManager;
import org.w3c.tidy.Tidy;

/**
 * Various parsing methods to get a clean and workable name out of weird filenames
 * 
 * @author Myron Boyle
 */
public class ParserUtils {

  private static final Logger LOGGER     = LoggerFactory.getLogger(ParserUtils.class);
  // token separators for filenames; handed both to String.split (where it acts as a regex character class)
  // and to StringUtils.split in the methods below
  private static final String DELIMITER  = "[\\[\\](){} _,.-]";

  // release/quality tags which are not part of a title. The movie parsers compare every filename token
  // case-insensitively against this list; the TV episode cleaner embeds each entry into a regex.
  // NOTE(review): "bdrip" is listed twice - harmless, but one entry is redundant
  public static String[]      stopwords  = { "1080", "1080i", "1080p", "2160p", "2160i", "3d", "480i", "480p", "576i", "576p", "720", "720i", "720p",
      "ac3", "ac3ld", "ac3md", "aoe", "atmos", "bd5", "bdrip", "bdrip", "blueray", "bluray", "brrip", "cam", "cd1", "cd2", "cd3", "cd4", "cd5", "cd6",
      "cd7", "cd8", "cd9", "complete", "custom", "dc", "disc1", "disc2", "disc3", "disc4", "disc5", "disc6", "disc7", "disc8", "disc9", "divx",
      "divx5", "dl", "docu", "dsr", "dsrip", "dts", "dtv", "dubbed", "dutch", "dvd", "dvd1", "dvd2", "dvd3", "dvd4", "dvd5", "dvd6", "dvd7", "dvd8",
      "dvd9", "dvdivx", "dvdrip", "dvdscr", "dvdscreener", "emule", "etm", "extended", "fragment", "fs", "fps", "german", "h264", "hd", "hddvd",
      "hdrip", "hdtv", "hdtvrip", "hevc", "hrhd", "hrhdtv", "ind", "internal", "ld", "limited", "ma", "md", "multisubs", "nfo", "nfofix", "ntg",
      "ntsc", "ogg", "ogm", "pal", "pdtv", "proper", "pso", "r3", "r5", "read", "repack", "rerip", "remux", "retail", "roor", "rs", "rsvcd",
      "screener", "se", "subbed", "svcd", "swedish", "tc", "telecine", "telesync", "ts", "truehd", "uncut", "unrated", "vcf", "webdl", "webrip",
      "workprint", "ws", "www", "x264", "xf", "xvid", "xvidvd", "xxx" };

  // frame rate tokens which contain a delimiter character (the dot) and therefore have to be removed BEFORE
  // splitting. Every entry is a regex fragment; the callers prepend "(?i)\\W", so a non-word character must
  // stand directly in front of the token for it to be removed.
  public static String[]      cleanwords = { "24\\.000", "23\\.976", "23\\.98", "24\\.00" };

  /**
   * Extracts only the movie title from a filename.<br>
   * This is a thin wrapper around {@link #detectCleanMovienameAndYear(String)} which hands back the first element (the title) of its result and
   * drops the detected year.
   * 
   * @param filename
   *          the filename to get the title from
   * @return the (hopefully) correct parsed movie name
   * @deprecated use {@link #detectCleanMovienameAndYear(String)} instead (avoid possible dupes)
   */
  @Deprecated
  public static String detectCleanMoviename(String filename) {
    String[] titleAndYear = detectCleanMovienameAndYear(filename);
    return titleAndYear[0];
  }

  /**
   * Tries to get movie name and year from filename (old implementation)<br>
   * 1. removes the file extension, a resolution like 1920x1080 and the FPS tokens from {@link #cleanwords}<br>
   * 2. splits the rest using the common delimiters<br>
   * 3. blanks every stopword / IMDB id token and remembers the position of the first stopword (never lower than 2)<br>
   * 4. scans the tokens backwards (the first token is never considered) for a plausible 4 digit year and sets [1]<br>
   * 5. everything before the first stopword - minus the configured bad words - must be the movie name :p
   * 
   * @param filename
   *          the filename to get the title from
   * @return title/year string (year can be empty); if no title token survives, the filename without extension is returned as title
   * @deprecated superseded by {@link #detectCleanMovienameAndYear(String)}
   */
  @Deprecated
  public static String[] detectCleanMovienameAndYearOLD(String filename) {
    String[] ret = { "", "" };
    // use trace to not remove logging completely (function called way too often on multi movie dir parsing);
    // parameterized logging avoids building the message when trace is disabled
    LOGGER.trace("Parse filename for movie title: \"{}\"", filename);

    if (filename == null || filename.isEmpty()) {
      LOGGER.warn("Filename empty?!");
      return ret;
    }

    // remove extension (if found) and split (keep var - it is the fallback title)
    String fname = filename.replaceFirst("\\.\\w{2,4}$", "");

    // replaces any resolution 1234x1234 (must start with a non-word, else too global)
    String cleaned = fname.replaceFirst("(?i)\\W\\d{3,4}x\\d{3,4}", " ");
    // replace FPS specific words (must start with a non-word, else too global)
    for (String cw : cleanwords) {
      cleaned = cleaned.replaceFirst("(?i)\\W" + cw, " ");
    }

    String[] s = cleaned.split(DELIMITER);
    int firstFoundStopwordPosition = s.length;

    // iterate over all split items
    for (int i = 0; i < s.length; i++) {
      if (s[i] == null || s[i].isEmpty()) {
        continue;
      }
      // search for stopword position
      for (String stop : stopwords) {
        if (s[i].equalsIgnoreCase(stop)) {
          s[i] = ""; // delete stopword
          // remember lowest position, but not lower than 2!!!
          if (i < firstFoundStopwordPosition && i >= 2) {
            firstFoundStopwordPosition = i;
          }
          break; // token is blanked now - nothing else can match
        }
      }
      if (Utils.isValidImdbId(s[i])) {
        s[i] = ""; // delete imdbId from name
      }
    }

    // scan backwards - if we have at least 1 token, and the last one is a 4 digit, assume year and remove
    // (index 0 is skipped on purpose: the first token is never treated as year)
    int currentYear = Calendar.getInstance().get(Calendar.YEAR);
    String year = "";
    for (int i = s.length - 1; i > 0; i--) {
      if (!s[i].isEmpty() && s[i].matches("\\d{4}")) {
        int parsedYear = Integer.parseInt(s[i]);
        if (parsedYear > 1800 && parsedYear < currentYear + 5) {
          // well, limit the year a bit...
          LOGGER.trace("removed token '{}'- seems to be year", s[i]);
          year = s[i];
          s[i] = "";
          break;
        }
      }
    }

    // rebuild string, respecting bad words
    StringBuilder name = new StringBuilder();
    for (int i = 0; i < firstFoundStopwordPosition; i++) {
      String word = s[i];
      // skip null as well as blanked tokens (the former guard let null slip through to toLowerCase())
      if (word == null || word.isEmpty()) {
        continue;
      }
      // check for bad words
      if (MovieModuleManager.SETTINGS.getBadWord().contains(word.toLowerCase(Locale.ROOT))) {
        continue;
      }
      // roman characters such as "Part Iv" should not be camel-cased
      switch (word.toUpperCase(Locale.ROOT)) {
        case "I":
        case "II":
        case "III":
        case "IV":
        case "V":
        case "VI":
        case "VII":
        case "VIII":
        case "IX":
        case "X":
          name.append(word.toUpperCase(Locale.ROOT)).append(" ");
          break;

        default:
          name.append(WordUtils.capitalizeFully(word)).append(" "); // make CamelCase
          break;
      }
    }

    if (name.length() == 0) {
      // started with a badword - return name unchanged
      ret[0] = fname;
    }
    else {
      ret[0] = name.toString().trim();
    }
    ret[1] = year.trim();
    LOGGER.trace("Movie title should be: \"{}\", from {}", ret[0], ret[1]);
    return ret;
  }

  /**
   * Tries to get movie name and year from filename<br>
   * 1. removes the file extension, a resolution like 1920x1080 and the FPS tokens from {@link #cleanwords}<br>
   * 2. cuts every [bracketed] group out of the name and keeps its tokens as "optional" tokens<br>
   * 3. truncates the name in front of an OTR style timestamp (like _12.11.17_20-15_) if that starts after index 10<br>
   * 4. splits the rest using the common delimiters (falls back to the optional tokens if nothing is left)<br>
   * 5. blanks every stopword / IMDB id token and remembers the position of the first stopword (never lower than 2)<br>
   * 6. scans the tokens backwards (the first token is never considered) for a plausible 4 digit year and sets [1]; if none is found, the optional
   * tokens are searched too<br>
   * 7. everything before the first stopword - minus the configured bad words - must be the movie name :p
   * 
   * @param filename
   *          the filename to get the title from
   * @return title/year string (year can be empty); if no title token survives, the pre-processed filename (after steps 1-3) is returned as title
   */
  public static String[] detectCleanMovienameAndYear(String filename) {
    String[] ret = { "", "" };
    // use trace to not remove logging completely (function called way too often on multi movie dir parsing);
    // parameterized logging avoids building the message when trace is disabled
    LOGGER.trace("Parse filename for movie title: \"{}\"", filename);

    if (filename == null || filename.isEmpty()) {
      LOGGER.warn("Filename empty?!");
      return ret;
    }

    // remove extension (if found) and split (keep var)
    String fname = filename.replaceFirst("\\.\\w{2,4}$", "");
    // replaces any resolution 1234x1234 (must start with a non-word, else too global)
    fname = fname.replaceFirst("(?i)\\W\\d{3,4}x\\d{3,4}", " ");
    // replace FPS specific words (must start with a non-word, else too global)
    for (String cw : cleanwords) {
      fname = fname.replaceFirst("(?i)\\W" + cw, " ");
    }

    LOGGER.trace("--------------------");
    LOGGER.trace("IN:  {}", fname);

    // Get [optionals] delimited
    List<String> opt = new ArrayList<>();
    Pattern p = Pattern.compile("\\[(.*?)\\]");
    // the matcher keeps working on the string it was created with, so re-assigning fname inside the loop is safe
    Matcher m = p.matcher(fname);
    while (m.find()) {
      LOGGER.trace("OPT: {}", m.group(1));
      String[] o = StringUtils.split(m.group(1), DELIMITER);
      opt.addAll(Arrays.asList(o));
      fname = fname.replace(m.group(), ""); // remove complete group from name
    }
    LOGGER.trace("ARR: {}", opt);

    // detect OTR recordings - at least with that special pattern
    p = Pattern.compile(".*?(_\\d{2}\\.\\d{2}\\.\\d{2}[_ ]+\\d{2}\\-\\d{2}\\_).*"); // like _12.11.17_20-15_
    m = p.matcher(fname);
    if (m.matches() && m.start(1) > 10) {
      // start at some later point, not that if pattern is first
      LOGGER.trace("OTR: {}", m.group(1));
      fname = fname.substring(0, m.start(1));
    }

    // parse good filename
    String[] s = StringUtils.split(fname, DELIMITER);
    if (s.length == 0) {
      // nothing left outside of the brackets - work with the optional tokens instead
      s = opt.toArray(new String[0]);
    }
    int firstFoundStopwordPosition = s.length;

    // iterate over all split items
    for (int i = 0; i < s.length; i++) {
      // search for stopword position
      for (String stop : stopwords) {
        if (s[i].equalsIgnoreCase(stop)) {
          s[i] = ""; // delete stopword
          // remember lowest position, but not lower than 2!!!
          if (i < firstFoundStopwordPosition && i >= 2) {
            firstFoundStopwordPosition = i;
          }
          break; // token is blanked now - nothing else can match
        }
      }
      if (Utils.isValidImdbId(s[i])) {
        s[i] = ""; // delete imdbId from name
      }
    }

    // scan backwards - if we have at least 1 token, and the last one is a 4 digit, assume year and remove
    // (index 0 is skipped on purpose: the first token is never treated as year)
    int currentYear = Calendar.getInstance().get(Calendar.YEAR);
    String year = "";
    for (int i = s.length - 1; i > 0; i--) {
      if (s[i].matches("\\d{4}")) {
        int parsedYear = Integer.parseInt(s[i]);
        if (parsedYear > 1800 && parsedYear < currentYear + 5) {
          // well, limit the year a bit...
          LOGGER.trace("removed token '{}'- seems to be year", s[i]);
          year = s[i];
          s[i] = "";
          break;
        }
      }
    }
    if (year.isEmpty()) {
      // parse all optional tags for it (no break: the last plausible one wins)
      for (String o : opt) {
        if (o.matches("\\d{4}")) {
          int parsedYear = Integer.parseInt(o);
          if (parsedYear > 1800 && parsedYear < currentYear + 5) {
            year = String.valueOf(parsedYear);
            LOGGER.trace("found possible year {}", o);
          }
        }
      }
    }

    // rebuild string, respecting bad words
    StringBuilder name = new StringBuilder();
    for (int i = 0; i < firstFoundStopwordPosition; i++) {
      String word = s[i];
      if (word.isEmpty()) {
        continue;
      }
      // check for bad words
      if (MovieModuleManager.SETTINGS.getBadWord().contains(word.toLowerCase(Locale.ROOT))) {
        continue;
      }
      // roman characters such as "Part Iv" should not be camel-cased
      switch (word.toUpperCase(Locale.ROOT)) {
        case "I":
        case "II":
        case "III":
        case "IV":
        case "V":
        case "VI":
        case "VII":
        case "VIII":
        case "IX":
        case "X":
          name.append(word.toUpperCase(Locale.ROOT)).append(" ");
          break;

        default:
          name.append(WordUtils.capitalizeFully(word)).append(" "); // make CamelCase
          break;
      }
    }

    if (name.length() == 0) {
      // started with a badword - return name unchanged
      ret[0] = fname;
    }
    else {
      ret[0] = name.toString().trim();
    }
    ret[1] = year.trim();
    LOGGER.trace("Movie title should be: \"{}\", from {}", ret[0], ret[1]);

    return ret;
  }

  /**
   * Looks for an IMDB id inside the given text.<br>
   * First a literal id ("tt" followed by 7 digits) is searched; if there is none, an old style "imdb.com/Title?1234567" link is searched and its
   * number gets prefixed with "tt".
   * 
   * @param text
   *          a string
   * @return imdbid or empty
   */
  public static String detectImdbId(String text) {
    if (text == null || text.isEmpty()) {
      return "";
    }

    String literalId = StrgUtils.substr(text, ".*(tt\\d{7}).*");
    if (!literalId.isEmpty()) {
      return literalId;
    }

    // fallback: only the number is part of the link, so the prefix has to be added
    String numberFromLink = StrgUtils.substr(text, ".*imdb\\.com\\/Title\\?(\\d{7}).*");
    return numberFromLink.isEmpty() ? "" : "tt" + numberFromLink;
  }

  /**
   * removes some weird number-stopwords like 1080, 720 etc.. to ease the regex parsing for season/episode<br>
   * Besides the first resolution token (1234x1234), every entry of {@link #stopwords} and every configured TV show bad word is replaced by a single
   * space - but only where it is preceded by a non-word character AND followed by a non-word character or the end of the string.
   * 
   * @param filename
   *          the file name to remove the stop- and bad words for
   * @return the cleaned one
   */
  public static String removeStopwordsAndBadwordsFromTvEpisodeName(String filename) {
    String before = filename;

    // replaces any resolution 1234x1234 (must start with a non-word, else too global)
    filename = filename.replaceFirst("(?i)\\W\\d{3,4}x\\d{3,4}", " ");

    for (String s : stopwords) {
      // TV stop words must start AND END with a non-word (else too global) or line end
      filename = filename.replaceAll("(?i)\\W" + s + "(\\W|$)", " ");
      if (LOGGER.isTraceEnabled() && filename.length() != before.length()) {
        LOGGER.trace("Removed some TV stopword ({}): {} -> {}", s, before, filename);
        before = filename;
      }
    }

    // also remove bad words
    for (String s : TvShowModuleManager.SETTINGS.getBadWord()) {
      // TV bad words must start AND END with a non-word (else too global) or line end.
      // Bad words come from the settings, so they are quoted: a regex meta character in one of them must be
      // matched literally instead of breaking (or changing) the pattern
      filename = filename.replaceAll("(?i)\\W" + Pattern.quote(s) + "(\\W|$)", " ");
      if (LOGGER.isTraceEnabled() && filename.length() != before.length()) {
        LOGGER.trace("Removed some TV bad word ({}): {} -> {}", s, before, filename);
        before = filename;
      }
    }
    return filename;
  }

  /**
   * Splits a title of the form "Title YEAR" or "Title (YEAR)" into its two parts.<br>
   * The result always has 2 elements: 0 = title; 1 = year. If no whitespace followed by a 4 digit number is found, the complete input is returned
   * as title and the year stays empty. A null input gives two empty strings.
   * 
   * @param title
   *          the title
   * @return the string[] (title, year)
   */
  public static String[] parseTitle(String title) {
    String[] titleAndYear = { "", "" };
    if (title == null) {
      return titleAndYear;
    }

    Matcher matcher = Pattern.compile("(.*)\\s+\\(?([0-9]{4})\\)?", Pattern.CASE_INSENSITIVE).matcher(title);
    if (!matcher.find()) {
      // no year found - the whole input is the title
      titleAndYear[0] = title;
      return titleAndYear;
    }

    titleAndYear[0] = matcher.group(1);
    titleAndYear[1] = matcher.group(2);
    return titleAndYear;
  }

  /**
   * Parses titles if they are in the form "Title (Year)"; since the brackets are optional in the used pattern, "Title Year" is matched as well. The
   * first element is the title, and the second element is the year, both can be null. If the matcher fails to find the pattern, then the passed in
   * title is set as the first element and the second element is null.
   * 
   * @param title
   *          the title
   * @return the pair (title, year); both elements are null for a null input
   */
  public static Pair<String, String> parseTitleAndDateInBrackets(String title) {
    if (title == null) {
      return new Pair<>(null, null);
    }

    Pattern p = Pattern.compile("(.*)\\s+\\(?([0-9]{4})\\)?", Pattern.CASE_INSENSITIVE);
    Matcher m = p.matcher(title);
    if (m.find()) {
      return new Pair<>(m.group(1), m.group(2));
    }

    return new Pair<>(title, null);
  }

  /**
   * Try to clean the NFO(XML) content with JTidy.<br>
   * This is a best effort operation: whenever JTidy fails, the failure is logged on debug level and the source is returned untouched.
   * 
   * @param sourceNfoContent
   *          the XML content to be cleaned
   * @return the cleaned XML content (or the source, if any Exceptions occur); null for a null input
   */
  public static String cleanNfo(String sourceNfoContent) {
    if (sourceNfoContent == null) {
      // nothing to clean - same result as before, but without relying on a caught NullPointerException
      return null;
    }

    try {
      Tidy tidy = new Tidy();
      tidy.setInputEncoding("UTF-8");
      tidy.setOutputEncoding("UTF-8");
      tidy.setWraplen(Integer.MAX_VALUE);
      tidy.setXmlOut(true);
      tidy.setSmartIndent(true);
      tidy.setXmlTags(true);
      tidy.setMakeClean(true);
      tidy.setForceOutput(true);
      tidy.setQuiet(true);
      tidy.setShowWarnings(false);
      StringReader in = new StringReader(sourceNfoContent);
      StringWriter out = new StringWriter();
      tidy.parse(in, out);

      return out.toString();
    }
    catch (Exception e) {
      // deliberately not rethrown - the caller gets the unmodified source; but leave a trace of what went wrong
      LOGGER.debug("could not clean NFO content: {}", e.getMessage());
    }
    return sourceNfoContent;
  }

  /**
   * for all strings, return the "cleanest" one detected by rateCleanness()<br>
   * Every name is parsed into a {@link ParserInfo} first; afterwards the one with the highest rating wins (the first one on a tie).
   * 
   * @param names
   *          strings
   * @return cleanest one; null if no name has been passed
   */
  public static ParserInfo getCleanerString(String... names) {
    // typed list - with a raw list the typed iteration below does not compile
    List<ParserInfo> candidates = new ArrayList<>(names.length);
    for (String s : names) {
      candidates.add(new ParserInfo(s));
    }

    ParserInfo ret = null;
    int rate = -10000;
    for (ParserInfo candidate : candidates) {
      int tmp = ParserUtils.rateCleanness(candidate);
      if (tmp > rate) {
        ret = candidate;
        rate = tmp;
      }
    }

    return ret;
  }

  /**
   * returns a score how "clean" the parsed title of the given info is<br>
   * CamelCase words and a large cut-off compared to the original name raise the score; many words, non-space separators and a long title lower it.
   * A detected year adds a fixed bonus.
   * 
   * @param info
   *          the info to rate
   * @return number, the higher, the better (-1 if the clean title is empty)
   */
  public static int rateCleanness(ParserInfo info) {
    if (info.clean.isEmpty()) {
      return -1;
    }

    int wordCount = info.clean.split(" ").length; // count words
    int separatorCount = info.clean.split("[_.-]").length - 1; // count other separators
    int upperCount = info.clean.replaceAll("[^A-Z]", "").length(); // count uppercase
    int nonUpperCount = info.clean.replaceAll("[A-Z]", "").length(); // everything but uppercase
    // integer arithmetic: share (in percent) of the original name which has been cut off
    double cleaned = 100 - info.clean.length() * 100 / info.name.length();

    // count CamelCase starts (an uppercase letter directly followed by a lowercase one)
    int camelCount = 0;
    for (Matcher camel = Pattern.compile("[A-Z][a-z]").matcher(info.clean); camel.find();) {
      camelCount++;
    }

    // boost CamelCase & cleaned words, rate non-space separators very worse, the lower words the better
    int rate = camelCount * 20 + (10 - wordCount * 2) * 2 + (separatorCount * -20) - info.clean.length() * 2 + (int) cleaned;
    if (!info.year.isEmpty()) {
      // we found a year in string, so boost this specially
      rate += 20;
    }

    LOGGER.trace(info + " - Rate:" + rate + "    PERC:" + cleaned + " LEN:" + info.clean.length() + " WRD:" + wordCount + " UC:" + upperCount
        + " LC:" + nonUpperCount + " CC:" + camelCount + " SEP:" + separatorCount);

    return rate;
  }

  /**
   * Holds a (trimmed) raw name together with the title and year which detectCleanMovienameAndYear() extracted from it.
   */
  public static class ParserInfo {
    // the trimmed input
    public String name  = "";
    // the detected year (empty if none has been found)
    public String year  = "";
    // the detected title
    public String clean = "";

    ParserInfo(String name) {
      this.name = name.trim();
      String[] titleAndYear = detectCleanMovienameAndYear(this.name);
      clean = titleAndYear[0];
      year = titleAndYear[1];
    }

    @Override
    public String toString() {
      return new StringBuilder().append(clean).append(" (").append(year).append(")").toString();
    }
  }
}