All downloads are free. Search and download functionality uses the official Maven repository.

us.codecraft.webmagic.utils.UrlUtils Maven / Gradle / Ivy

There is a newer version: 1.0.2
Show newest version
package us.codecraft.webmagic.utils;

import org.apache.commons.lang3.StringUtils;
import us.codecraft.webmagic.Request;

import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * url and html utils.
 *
 * @author [email protected] 
* @since 0.1.0 */ public class UrlUtils { /** * canonicalizeUrl *
* Borrowed from Jsoup. * * @param url url * @param refer refer * @return canonicalizeUrl */ public static String canonicalizeUrl(String url, String refer) { URL base; try { try { base = new URL(refer); } catch (MalformedURLException e) { // the base is unsuitable, but the attribute may be abs on its own, so try that URL abs = new URL(refer); return abs.toExternalForm(); } // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired if (url.startsWith("?")) url = base.getPath() + url; URL abs = new URL(base, url); return encodeIllegalCharacterInUrl(abs.toExternalForm()); } catch (MalformedURLException e) { return ""; } } /** * * @param url url * @return new url */ public static String encodeIllegalCharacterInUrl(String url) { //TODO more charator support return url.replace(" ", "%20"); } public static String getHost(String url) { String host = url; int i = StringUtils.ordinalIndexOf(url, "/", 3); if (i > 0) { host = StringUtils.substring(url, 0, i); } return host; } private static Pattern patternForProtocal = Pattern.compile("[\\w]+://"); public static String removeProtocol(String url) { return patternForProtocal.matcher(url).replaceAll(""); } public static String getDomain(String url) { String domain = removeProtocol(url); int i = StringUtils.indexOf(domain, "/", 1); if (i > 0) { domain = StringUtils.substring(domain, 0, i); } return domain; } /** * allow blank space in quote */ private static Pattern patternForHrefWithQuote = Pattern.compile("(]*href=)[\"']([^\"'<>]*)[\"']", Pattern.CASE_INSENSITIVE); /** * disallow blank space without quote */ private static Pattern patternForHrefWithoutQuote = Pattern.compile("(]*href=)([^\"'<>\\s]+)", Pattern.CASE_INSENSITIVE); public static String fixAllRelativeHrefs(String html, String url) { html = replaceByPattern(html, url, patternForHrefWithQuote); html = replaceByPattern(html, url, patternForHrefWithoutQuote); return html; } public static String replaceByPattern(String html, String 
url, Pattern pattern) { StringBuilder stringBuilder = new StringBuilder(); Matcher matcher = pattern.matcher(html); int lastEnd = 0; boolean modified = false; while (matcher.find()) { modified = true; stringBuilder.append(StringUtils.substring(html, lastEnd, matcher.start())); stringBuilder.append(matcher.group(1)); stringBuilder.append("\"").append(canonicalizeUrl(matcher.group(2), url)).append("\""); lastEnd = matcher.end(); } if (!modified) { return html; } stringBuilder.append(StringUtils.substring(html, lastEnd)); return stringBuilder.toString(); } public static List convertToRequests(Collection urls) { List requestList = new ArrayList(urls.size()); for (String url : urls) { requestList.add(new Request(url)); } return requestList; } public static List convertToUrls(Collection requests) { List urlList = new ArrayList(requests.size()); for (Request request : requests) { urlList.add(request.getUrl()); } return urlList; } private static final Pattern patternForCharset = Pattern.compile("charset\\s*=\\s*['\"]*([^\\s;'\"]*)"); public static String getCharset(String contentType) { Matcher matcher = patternForCharset.matcher(contentType); if (matcher.find()) { String charset = matcher.group(1); if (Charset.isSupported(charset)) { return charset; } } return null; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy