org.tinymediamanager.scraper.util.UrlUtil Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of api-scraper Show documentation
Show all versions of api-scraper Show documentation
API for tinyMediaManager scrapers
/*
* Copyright 2012 - 2019 Manuel Laggner
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.tinymediamanager.scraper.util;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.List;
import java.util.Locale;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.LocaleUtils;
import org.apache.commons.lang3.SystemUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.tinymediamanager.scraper.http.InMemoryCachedUrl;
import org.tinymediamanager.scraper.http.OnDiskCachedUrl;
import org.tinymediamanager.scraper.http.Url;
/**
* The class UrlUtil. This class is used for Url related tasks
*
* @author Manuel Laggner / Myron Boyle
* @since 1.0
*/
public class UrlUtil {
private static final Logger LOGGER = LoggerFactory.getLogger(UrlUtil.class);
public static final String UTF_8 = "UTF-8";
public enum UrlImpl {
DIRECT,
IN_MEMORY,
ON_DISK
}
private UrlUtil() {
// hide public constructor for utility classes
}
/**
* Casts url string to URI, and does the correct encoding (rfc2396) of query string ONLY (eg "|" character). URLEncoder encodes everything which
* might break commons.http
*
* @param url
* the url as string
* @return URI object
* @throws URISyntaxException
* if url could not be parsed / invalid
*/
public static URI getURIEncoded(String url) throws URISyntaxException {
String[] trArr = url.split("://");
return new URI(trArr[0], "//" + trArr[1], null);
}
/**
* Returns file extension from url.
*
* @param url
* the url
* @return file extension or empty string
* @throws URISyntaxException
* if url is not valid
*/
public static String getFileExtension(String url) throws URISyntaxException {
String ext = getURIEncoded(url).getPath();
if (ext == null || ext.isEmpty() || !ext.contains(".")) {
LOGGER.warn("Url " + url + " has no extension!");
return "";
}
else {
ext = getFileNameArray(url)[1];
return ext;
}
}
/**
* gets the BaseName (w/o extension) of an URL (better than commons-io)
*
* @param url
* the to get the base name from
* @return BaseName
*/
public static String getBasename(String url) {
return getFileNameArray(url)[0];
}
/**
* gets the Extension of an URL (better than commons-io)
*
* @param url
* the to get the extension from
* @return BaseName
*/
public static String getExtension(String url) {
return getFileNameArray(url)[1];
}
/**
* gets the FileName (with extension) of an URL (better than commons-io)
*
* @param url
* the to get the file name from
* @return BaseName
*/
public static String getFilename(String url) {
return getFileNameArray(url)[2];
}
/**
* Returns the the entire Url Path except the filename, like doing a basedir on a filename.
*
* @param url
* the url
* @return the base url
*/
public static String getBaseUrl(String url) {
String path = getPathName(url);
if (path != null && path.contains("/")) {
path = path.substring(0, path.lastIndexOf('/'));
}
return getDomainUrl(url) + path;
}
/**
* Gets the domain url.
*
* @param url
* the url
* @return the domain url
*/
public static String getDomainUrl(String url) {
URL u;
try {
u = new URL(url);
return String.format("%s://%s/", u.getProtocol(), u.getHost());
}
catch (MalformedURLException e) {
LOGGER.error("Failed to get domain url for: " + url);
}
return null;
}
/**
* Join url path.
*
* @param baseUrl
* the base url
* @param path
* the path
* @return the string
*/
public static String joinUrlPath(String baseUrl, String path) {
StringBuffer sb = new StringBuffer(baseUrl);
if (baseUrl.endsWith("/") && path.startsWith("/")) {
path = path.substring(1);
}
sb.append(path);
return sb.toString();
}
/**
* Gets the path name.
*
* @param url
* the url
* @return the path name
*/
public static String getPathName(String url) {
URL u;
try {
u = new URL(url);
return u.getPath();
}
catch (MalformedURLException e) {
LOGGER.error("getPathName() Failed! " + url, e);
}
return null;
}
/**
* get the correct name/extension/filename of url (even with parameters! - commons-io CANNOT)
*
* @param url
* the url
* @return basename/ext/filename array
*/
public static String[] getFileNameArray(String url) {
String[] ret = new String[] { "", "", "" };
// URL:
// "http://photosaaaaa.net/photos-ak-snc1/v315/224/13/659629384/s659629384_752969_4472.jpg?asdf=jklo"
String filename = "";
String path = "";
// PATH:
// /photos-ak-snc1/v315/224/13/659629384/s659629384_752969_4472.jpg?asdf=jklo
try {
url = getURIEncoded(url).toString();
path = new URL(url).getPath();
}
catch (Exception e) {
return ret;
}
// Checks for both forward and/or backslash
// NOTE:**While backslashes are not supported in URL's
// most browsers will autoreplace them with forward slashes
// So technically if you're parsing an html page you could run into
// a backslash , so i'm accounting for them here;
String[] pathContents = path.split("[\\\\/]");
if (pathContents != null) {
int pathContentsLength = pathContents.length;
// lastPart: s659629384_752969_4472.jpg
String lastPart = pathContents[pathContentsLength - 1];
String[] lastPartContents = lastPart.split("\\.");
if (lastPartContents != null && lastPartContents.length > 1) {
int lastPartContentLength = lastPartContents.length;
// filenames can contain . , so we assume everything before
// the last . is the name, everything after the last . is the
// extension
String name = "";
for (int i = 0; i < lastPartContentLength; i++) {
if (i < (lastPartContents.length - 1)) {
name += lastPartContents[i];
if (i < (lastPartContentLength - 2)) {
name += ".";
}
}
}
String extension = lastPartContents[lastPartContentLength - 1];
filename = name + "." + extension;
ret = new String[] { name, extension, filename };
}
else {
// no extension (eg youtube)
String name = lastPartContents[0];
ret = new String[] { name, "", name };
}
}
return ret;
}
/**
* Encode.
*
* @param data
* the data
* @return the string
*/
@SuppressWarnings("deprecation")
public static String encode(String data) {
if (data == null) {
return "";
}
try {
return URLEncoder.encode(data, "UTF-8");
}
catch (UnsupportedEncodingException e) {
LOGGER.warn("Failed to url encode data: " + data + " as UTF-8; will try again using default encoding", e);
return URLEncoder.encode(data);
}
}
/**
* generates a valid user-agent
* something like:
* Mozilla/5.0 (Windows; Windows NT 6.1; Windows 7 6.1; U; amd64; de-DE; rv:57.0) Gecko/20100101 Firefox/57.0
* but with correct OS and language values
*/
public static String generateUA() {
return generateUA(Locale.getDefault().getLanguage());
}
/**
* generates a valid user-agent
* something like:
* Mozilla/5.0 (Windows; Windows NT 6.1; Windows 7 6.1; U; amd64; de-DE; rv:57.0) Gecko/20100101 Firefox/57.0
* but with correct OS and language values
*
* @param language
* take the given language rather than the systems default
*/
public static String generateUA(String language) {
// this is due to the fact, that the OS is not correctly recognized (eg
// Mobile FirefoxOS, where it isn't)
String hardcodeOS = "";
if (SystemUtils.IS_OS_WINDOWS) {
hardcodeOS = "Windows; Windows NT " + System.getProperty("os.version");
}
else if (SystemUtils.IS_OS_MAC) {
hardcodeOS = "Macintosh";
}
else if (SystemUtils.IS_OS_LINUX) {
hardcodeOS = "X11";
}
else {
hardcodeOS = System.getProperty("os.name");
}
Locale l = getLocaleFromLanguage(language);
// @formatter:off
return String.format("Mozilla/5.0 (%1$s; %2$s %3$s; U; %4$s; %5$s-%6$s; rv:57.0) Gecko/20100101 Firefox/57.0", hardcodeOS,
System.getProperty("os.name", ""), System.getProperty("os.version", ""), System.getProperty("os.arch", ""), l.getLanguage(), l.getCountry());
// @formatter:on
}
/**
* Gets a correct Locale (language + country) from given language.
*
* @param language
* as 2char
* @return Locale
*/
public static Locale getLocaleFromLanguage(String language) {
if (language == null || language.isEmpty()) {
return null;
}
if (language.equalsIgnoreCase("en")) {
return new Locale("en", "US"); // don't mess around; at least fixtate this
}
Locale l = null;
List countries = LocaleUtils.countriesByLanguage(language.toLowerCase(Locale.ROOT));
for (Locale locale : countries) {
if (locale.getCountry().equalsIgnoreCase(language)) {
// map to main countries; de->de_DE (and not de_CH)
l = locale;
}
}
if (l == null && countries.size() > 0) {
// well, take the first one
l = countries.get(0);
}
return l;
}
/**
* fetch the given url and parse it into a {@link Document}
*
* @param urlAsString
* the url as string
* @return the parsed {@link Document}
* @throws IOException
* Indicates that an {@link IOException} occurred
* @throws InterruptedException
* Indicates that this thread as been interrupted
*/
public static Document parseDocumentFromUrl(String urlAsString) throws IOException, InterruptedException {
return parseDocumentFromUrl(urlAsString, UrlImpl.DIRECT);
}
/**
* fetch the given url with the desired url type and parse it into a {@link Document}
*
* @param urlAsString
* the url as string
* @param urlImpl
* the desired url type
* @return the parsed {@link Document}
* @throws IOException
* Indicates that an {@link IOException} occurred
* @throws InterruptedException
* Indicates that this thread as been interrupted
*/
public static Document parseDocumentFromUrl(String urlAsString, UrlImpl urlImpl) throws IOException, InterruptedException {
try (InputStream in = getUrlFromUrlImpl(urlAsString, urlImpl).getInputStream()) {
return Jsoup.parse(in, UrlUtil.UTF_8, "");
}
}
/**
* fetch the given url and parse it into a {@link String}
*
* @param urlAsString
* the url as string
* @return the parsed {@link String}
* @throws IOException
* Indicates that an {@link IOException} occurred
* @throws InterruptedException
* Indicates that this thread as been interrupted
*/
public static String getStringFromUrl(String urlAsString) throws IOException, InterruptedException {
return getStringFromUrl(urlAsString, UrlImpl.DIRECT);
}
/**
* fetch the given url and parse it into a {@link String}
*
* @param urlAsString
* the url as string
* @param urlImpl
* the desired url type
* @return the parsed {@link String}
* @throws IOException
* Indicates that an {@link IOException} occurred
* @throws InterruptedException
* Indicates that this thread as been interrupted
*/
public static String getStringFromUrl(String urlAsString, UrlImpl urlImpl) throws IOException, InterruptedException {
try (InputStream is = getUrlFromUrlImpl(urlAsString, urlImpl).getInputStream()) {
return IOUtils.toString(is, UrlUtil.UTF_8);
}
}
/**
* fetch the given url and return the content in a byte array
*
* @param urlAsString
* the url as string
* @return a byte array of the content
* @throws IOException
* Indicates that an {@link IOException} occurred
* @throws InterruptedException
* Indicates that this thread as been interrupted
*/
public static byte[] getByteArrayFromUrl(String urlAsString) throws IOException, InterruptedException {
return getByteArrayFromUrl(urlAsString, UrlImpl.DIRECT);
}
/**
* fetch the given url and return the content in a byte array
*
* @param urlAsString
* the url as string
* @param urlImpl
* the desired url type
* @return a byte array of the content
* @throws IOException
* Indicates that an {@link IOException} occurred
* @throws InterruptedException
* Indicates that this thread as been interrupted
*/
public static byte[] getByteArrayFromUrl(String urlAsString, UrlImpl urlImpl) throws IOException, InterruptedException {
try (InputStream is = getUrlFromUrlImpl(urlAsString, urlImpl).getInputStream()) {
return IOUtils.toByteArray(is);
}
}
/**
* get the desired implementation for an {@link Url}
*
* @param urlAsString
* the url destination
* @param urlImpl
* the desired implementation
* @return an instance of the desired {@link Url} implementation ({@link Url} as default)
* @throws IOException
* Indicates that an {@link IOException} occurred
*/
private static Url getUrlFromUrlImpl(String urlAsString, UrlImpl urlImpl) throws IOException {
switch (urlImpl) {
case IN_MEMORY:
return new InMemoryCachedUrl(urlAsString);
case ON_DISK:
return new OnDiskCachedUrl(urlAsString);
default:
return new Url(urlAsString);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy