All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.tinymediamanager.scraper.util.ParserUtils Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2012 - 2019 Manuel Laggner
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.tinymediamanager.scraper.util;

import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.WordUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tinymediamanager.core.Utils;
import org.tinymediamanager.core.movie.MovieModuleManager;
import org.tinymediamanager.core.tvshow.TvShowModuleManager;
import org.w3c.tidy.Tidy;

/**
 * Various parses methods to get a clean and workable name out of weird filenames
 * 
 * @author Myron Boyle
 */
public class ParserUtils {

  private static final Logger LOGGER     = LoggerFactory.getLogger(ParserUtils.class);
  private static final String DELIMITER  = "[\\[\\](){} _,.-]";

  public static String[]      stopwords  = { "1080", "1080i", "1080p", "2160p", "2160i", "3d", "480i", "480p", "576i", "576p", "720", "720i", "720p",
      "ac3", "ac3ld", "ac3md", "aoe", "atmos", "bd5", "bdrip", "bdrip", "blueray", "bluray", "brrip", "cam", "cd1", "cd2", "cd3", "cd4", "cd5", "cd6",
      "cd7", "cd8", "cd9", "complete", "custom", "dc", "disc1", "disc2", "disc3", "disc4", "disc5", "disc6", "disc7", "disc8", "disc9", "divx",
      "divx5", "dl", "docu", "dsr", "dsrip", "dts", "dtv", "dubbed", "dutch", "dvd", "dvd1", "dvd2", "dvd3", "dvd4", "dvd5", "dvd6", "dvd7", "dvd8",
      "dvd9", "dvdivx", "dvdrip", "dvdscr", "dvdscreener", "emule", "etm", "extended", "fragment", "fs", "fps", "german", "h264", "hd", "hddvd",
      "hdrip", "hdtv", "hdtvrip", "hevc", "hrhd", "hrhdtv", "ind", "internal", "ld", "limited", "ma", "md", "multisubs", "nfo", "nfofix", "ntg",
      "ntsc", "ogg", "ogm", "pal", "pdtv", "proper", "pso", "r3", "r5", "read", "repack", "rerip", "remux", "retail", "roor", "rs", "rsvcd",
      "screener", "se", "subbed", "svcd", "swedish", "tc", "telecine", "telesync", "ts", "truehd", "uncut", "unrated", "vcf", "webdl", "webrip",
      "workprint", "ws", "www", "x264", "xf", "xvid", "xvidvd", "xxx" };

  // clean before splitting (needs delimiter in front!)
  public static String[]      cleanwords = { "24\\.000", "23\\.976", "23\\.98", "24\\.00" };

  /**
   * Tries to get movie name from filename
* 1. splits string using common delimiters ".- ()"
* 2. searches for first occurrence of common stopwords
* 3. if last token is 4 digits, assume year and remove
* 4. everything before the first stopword must be the movie name :p
*
* Deprecated in favor of detectCleanMovienameAndYear (avoid possible dupes) * * @param filename * the filename to get the title from * @return the (hopefully) correct parsed movie name */ @Deprecated public static String detectCleanMoviename(String filename) { return detectCleanMovienameAndYear(filename)[0]; } /** * Tries to get movie name and year from filename
* 1. splits string using common delimiters ".- ()"
* 2. searches for first occurrence of common stopwords
* 3. if last token is 4 digits, assume year and set [1]
* 4. everything before the first stopword must be the movie name :p * * @param filename * the filename to get the title from * @return title/year string (year can be empty) */ @Deprecated public static String[] detectCleanMovienameAndYearOLD(String filename) { String[] ret = { "", "" }; // use trace to not remove logging completely (function called way to often on multi movie dir parsing) LOGGER.trace("Parse filename for movie title: \"" + filename + "\""); if (filename == null || filename.isEmpty()) { LOGGER.warn("Filename empty?!"); return ret; } // remove extension (if found) and split (keep var) String fname = filename.replaceFirst("\\.\\w{2,4}$", ""); // replaces any resolution 1234x1234 (must start with a non-word (else too global) String cleaned = fname.replaceFirst("(?i)\\W\\d{3,4}x\\d{3,4}", " "); // replace FPS specific words (must start with a non-word (else too global) for (String cw : cleanwords) { cleaned = cleaned.replaceFirst("(?i)\\W" + cw, " "); } String[] s = cleaned.split(DELIMITER); int firstFoundStopwordPosition = s.length; // iterate over all splitted items for (int i = 0; i < s.length; i++) { // search for stopword position if (s[i] != null && !s[i].isEmpty()) { for (String stop : stopwords) { if (s[i].equalsIgnoreCase(stop)) { s[i] = ""; // delete stopword // remember lowest position, but not lower than 2!!! if (i < firstFoundStopwordPosition && i >= 2) { firstFoundStopwordPosition = i; } } } if (Utils.isValidImdbId(s[i])) { s[i] = ""; // delete imdbId from name } } } // scan backwards - if we have at least 1 token, and the last one is a 4 digit, assume year and remove String year = ""; for (int i = s.length - 1; i > 0; i--) { if (!s[i].isEmpty() && s[i].matches("\\d{4}")) { int currentYear = Calendar.getInstance().get(Calendar.YEAR); int parsedYear = Integer.parseInt(s[i]); if (parsedYear > 1800 && parsedYear < currentYear + 5) { // well, limit the year a bit... LOGGER.trace("removed token '" + s[i] + "'- seems to be year"); year = s[i]; s[i] = ""; break; } } } // rebuild string, respecting bad words StringBuilder name = new StringBuilder(); for (int i = 0; i < firstFoundStopwordPosition; i++) { if (!(s[i] != null && s[i].isEmpty())) { // check for bad words if (!MovieModuleManager.SETTINGS.getBadWord().contains(s[i].toLowerCase(Locale.ROOT))) { String word = s[i]; // roman characters such as "Part Iv" should not be camel-cased switch (word.toUpperCase(Locale.ROOT)) { case "I": case "II": case "III": case "IV": case "V": case "VI": case "VII": case "VIII": case "IX": case "X": name.append(word.toUpperCase(Locale.ROOT)).append(" "); break; default: name.append(WordUtils.capitalizeFully(word)).append(" "); // make CamelCase break; } } } } if (name.length() == 0) { // started with a badword - return name unchanged ret[0] = fname; } else { ret[0] = name.toString().trim(); } ret[1] = year.trim(); LOGGER.trace("Movie title should be: \"" + ret[0] + "\", from " + ret[1]); return ret; } /** * Tries to get movie name and year from filename
* 1. splits string using common delimiters ".- ()"
* 2. searches for first occurrence of common stopwords
* 3. if last token is 4 digits, assume year and set [1]
* 4. everything before the first stopword must be the movie name :p * * @param filename * the filename to get the title from * @return title/year string (year can be empty) */ public static String[] detectCleanMovienameAndYear(String filename) { String[] ret = { "", "" }; // use trace to not remove logging completely (function called way to often on multi movie dir parsing) LOGGER.trace("Parse filename for movie title: \"" + filename + "\""); if (filename == null || filename.isEmpty()) { LOGGER.warn("Filename empty?!"); return ret; } // remove extension (if found) and split (keep var) String fname = filename.replaceFirst("\\.\\w{2,4}$", ""); // replaces any resolution 1234x1234 (must start and end with a non-word (else too global) fname = fname.replaceFirst("(?i)\\W\\d{3,4}x\\d{3,4}", " "); // replace FPS specific words (must start with a non-word (else too global) for (String cw : cleanwords) { fname = fname.replaceFirst("(?i)\\W" + cw, " "); } LOGGER.trace("--------------------"); LOGGER.trace("IN: " + fname); // Get [optionals] delimited List opt = new ArrayList<>(); Pattern p = Pattern.compile("\\[(.*?)\\]"); Matcher m = p.matcher(fname); while (m.find()) { LOGGER.trace("OPT: " + m.group(1)); String[] o = StringUtils.split(m.group(1), DELIMITER); opt.addAll(Arrays.asList(o)); fname = fname.replace(m.group(), ""); // remove complete group from name } LOGGER.trace("ARR: " + opt); // detect OTR recordings - at least with that special pattern p = Pattern.compile(".*?(_\\d{2}\\.\\d{2}\\.\\d{2}[_ ]+\\d{2}\\-\\d{2}\\_).*"); // like _12.11.17_20-15_ m = p.matcher(fname); if (m.matches() && m.start(1) > 10) { // start at some later point, not that if pattern is first LOGGER.trace("OTR: " + m.group(1)); fname = fname.substring(0, m.start(1)); } // parse good filename String[] s = StringUtils.split(fname, DELIMITER); if (s.length == 0) { s = opt.toArray(new String[opt.size()]); } int firstFoundStopwordPosition = s.length; // iterate over all splitted items for (int i = 0; i < s.length; i++) { // search for stopword position for (String stop : stopwords) { if (s[i].equalsIgnoreCase(stop)) { s[i] = ""; // delete stopword // remember lowest position, but not lower than 2!!! if (i < firstFoundStopwordPosition && i >= 2) { firstFoundStopwordPosition = i; } } } if (Utils.isValidImdbId(s[i])) { s[i] = ""; // delete imdbId from name } } // scan backwards - if we have at least 1 token, and the last one is a 4 digit, assume year and remove int currentYear = Calendar.getInstance().get(Calendar.YEAR); String year = ""; for (int i = s.length - 1; i > 0; i--) { if (s[i].matches("\\d{4}")) { int parsedYear = Integer.parseInt(s[i]); if (parsedYear > 1800 && parsedYear < currentYear + 5) { // well, limit the year a bit... LOGGER.trace("removed token '" + s[i] + "'- seems to be year"); year = s[i]; s[i] = ""; break; } } } if (year.isEmpty()) { // parse all optional tags for it for (String o : opt) { if (o.matches("\\d{4}")) { int parsedYear = Integer.parseInt(o); if (parsedYear > 1800 && parsedYear < currentYear + 5) { year = String.valueOf(parsedYear); LOGGER.trace("found possible year " + o); } } } } // rebuild string, respecting bad words StringBuilder name = new StringBuilder(); for (int i = 0; i < firstFoundStopwordPosition; i++) { if (!s[i].isEmpty()) { // check for bad words if (!MovieModuleManager.SETTINGS.getBadWord().contains(s[i].toLowerCase(Locale.ROOT))) { String word = s[i]; // roman characters such as "Part Iv" should not be camel-cased switch (word.toUpperCase(Locale.ROOT)) { case "I": case "II": case "III": case "IV": case "V": case "VI": case "VII": case "VIII": case "IX": case "X": name.append(word.toUpperCase(Locale.ROOT)).append(" "); break; default: name.append(WordUtils.capitalizeFully(word)).append(" "); // make CamelCase break; } } } } if (name.length() == 0) { // started with a badword - return name unchanged ret[0] = fname; } else { ret[0] = name.toString().trim(); } ret[1] = year.trim(); LOGGER.trace("Movie title should be: \"" + ret[0] + "\", from " + ret[1]); return ret; } /** * gets IMDB id out of filename * * @param text * a string * @return imdbid or empty */ public static String detectImdbId(String text) { String imdb = ""; if (text != null && !text.isEmpty()) { imdb = StrgUtils.substr(text, ".*(tt\\d{7}).*"); if (imdb.isEmpty()) { imdb = StrgUtils.substr(text, ".*imdb\\.com\\/Title\\?(\\d{7}).*"); if (!imdb.isEmpty()) { imdb = "tt" + imdb; } } } return imdb; } /** * removes some weird number-stopwords like 1080, 720 etc.. to ease the regex parsing for season/episode * * @param filename * the file name to remove the stop- and bad words for * @return the cleaned one */ public static String removeStopwordsAndBadwordsFromTvEpisodeName(String filename) { String before = filename; // replaces any resolution 1234x1234 (must start with a non-word (else too global) filename = filename.replaceFirst("(?i)\\W\\d{3,4}x\\d{3,4}", " "); for (String s : stopwords) { filename = filename.replaceAll("(?i)\\W" + s + "(\\W|$)", " "); // TV stop words must start AND END with a non-word (else too global) or line // end if (LOGGER.isTraceEnabled() && filename.length() != before.length()) { LOGGER.trace("Removed some TV stopword (" + s + "): " + before + " -> " + filename); before = filename; } } // also remove bad words for (String s : TvShowModuleManager.SETTINGS.getBadWord()) { filename = filename.replaceAll("(?i)\\W" + s + "(\\W|$)", " "); // TV bad words must start AND END with a non-word (else too global) or line end if (LOGGER.isTraceEnabled() && filename.length() != before.length()) { LOGGER.trace("Removed some TV bad word (" + s + "): " + before + " -> " + filename); before = filename; } } return filename; } /** * return a 2 element array. 0 = title; 1=date * * parses the title in the format Title YEAR or Title (YEAR) * * @param title * the title * @return the string[] */ public static String[] parseTitle(String title) { String v[] = { "", "" }; if (title == null) return v; Pattern p = Pattern.compile("(.*)\\s+\\(?([0-9]{4})\\)?", Pattern.CASE_INSENSITIVE); Matcher m = p.matcher(title); if (m.find()) { v[0] = m.group(1); v[1] = m.group(2); } else { v[0] = title; } return v; } /** * Parses titles if they are in the form Title (Year). The first element is the title, and the second element is the date, both can be null. If the * matcher fails to find the pattern, then the passed in title is set as the first element, which is the title. * * @param title * the title * @return the pair */ public static Pair parseTitleAndDateInBrackets(String title) { if (title == null) return new Pair<>(null, null); Pattern p = Pattern.compile("(.*)\\s+\\(?([0-9]{4})\\)?", Pattern.CASE_INSENSITIVE); Matcher m = p.matcher(title); if (m.find()) { return new Pair<>(m.group(1), m.group(2)); } return new Pair<>(title, null); } /** * Try to clean the NFO(XML) content with JTidy. * * @param sourceNfoContent * the XML content to be cleaned * @return the cleaned XML content (or the source, if any Exceptions occur) */ public static String cleanNfo(String sourceNfoContent) { try { Tidy tidy = new Tidy(); tidy.setInputEncoding("UTF-8"); tidy.setOutputEncoding("UTF-8"); tidy.setWraplen(Integer.MAX_VALUE); tidy.setXmlOut(true); tidy.setSmartIndent(true); tidy.setXmlTags(true); tidy.setMakeClean(true); tidy.setForceOutput(true); tidy.setQuiet(true); tidy.setShowWarnings(false); StringReader in = new StringReader(sourceNfoContent); StringWriter out = new StringWriter(); tidy.parse(in, out); return out.toString(); } catch (Exception ignored) { } return sourceNfoContent; } /** * for all strings, return the "cleanest" one detected by rateCleanness() * * @param names * strings * @return cleanest one */ public static ParserInfo getCleanerString(String... names) { ArrayList info = new ArrayList<>(1); ParserInfo ret = null; int rate = -10000; for (String s : names) { info.add(new ParserInfo(s)); } for (ParserInfo i : info) { int tmp = ParserUtils.rateCleanness(i); if (tmp > rate) { ret = i; rate = tmp; } } return ret; } /** * returns a count how "clean" a string is
* CamelCase name with space as delimiter should get a higher value...
* * @param info * the info to rate * @return number, the higher, the better */ public static int rateCleanness(ParserInfo info) { if (info.clean.isEmpty()) { return -1; } int rate = 0; int words = info.clean.split(" ").length; // count words int seps = info.clean.split("[_.-]").length - 1; // count other separators int uc = info.clean.replaceAll("[^A-Z]", "").length(); // count uppercase int lc = info.clean.replaceAll("[A-Z]", "").length(); // count lowercase double cleaned = 100 - info.clean.length() * 100 / info.name.length(); int cc = 0; // count CamelCase Pattern pattern = Pattern.compile("[A-Z][a-z]"); Matcher matcher = pattern.matcher(info.clean); while (matcher.find()) { cc++; } // boost CamesCase & cleaned words, rate non-space separators very worse, the lower words the better rate = cc * 20 + (10 - words * 2) * 2 + (seps * -20) - info.clean.length() * 2 + (int) cleaned; if (!info.year.isEmpty()) { // we found a year in string, so boost this specially rate += 20; } LOGGER.trace(info + " - Rate:" + rate + " PERC:" + cleaned + " LEN:" + info.clean.length() + " WRD:" + words + " UC:" + uc + " LC:" + lc + " CC:" + cc + " SEP:" + seps); return rate; } public static class ParserInfo { public String name = ""; public String year = ""; public String clean = ""; ParserInfo(String name) { this.name = name.trim(); String[] ty = detectCleanMovienameAndYear(this.name); this.clean = ty[0]; this.year = ty[1]; } @Override public String toString() { return clean + " (" + this.year + ")"; } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy