All downloads are free. Search and download functionality uses the official Maven repository.

com.goikosoft.crawler4j.parser.CssParseData Maven / Gradle / Ivy

Go to download

crawler4j: Open Source Web Crawler for Java. Modified by Dario Goikoetxea to add POST capabilities

There is a newer version: 4.5.11
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
* Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.goikosoft.crawler4j.parser;

import java.io.UnsupportedEncodingException;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.goikosoft.crawler4j.url.URLCanonicalizer;
import com.goikosoft.crawler4j.url.WebURL;

/**
 * Parse data for a CSS resource.
 *
 * <p>Scans the stylesheet text for {@code url(...)} references, resolves each
 * one against the page the stylesheet was fetched from, and stores the result
 * as this object's outgoing URLs.
 */
public class CssParseData extends TextParseData {

    /**
     * Matches the three {@code url(...)} spellings (single-quoted,
     * double-quoted, unquoted) in capture groups 1-3, plus whole CSS comments.
     *
     * <p>Comments are matched only so that the scanner consumes them; a match
     * with no URL group set is a comment and is skipped, which keeps
     * {@code url(...)} text inside comments from being reported.
     *
     * <p>The comment alternative is written in "unrolled" form
     * ({@code [^*]*\*+(?:[^/*][^*]*\*+)*}) instead of a per-character
     * alternation inside a repeated group: java.util.regex recurses once per
     * iteration of such a group, so long comments could exhaust the stack.
     */
    private static final Pattern URL_PATTERN = Pattern.compile(
        "url\\(\\s*'([^\\)]+)'\\s*\\)"                  // url('...')
            + "|url\\(\\s*\"([^\\)]+)\"\\s*\\)"         // url("...")
            + "|url\\(\\s*([^\\)]+)\\s*\\)"             // url(...)
            + "|/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/");    // comments: matched, then ignored

    /**
     * Extracts every {@code url(...)} reference from this object's text
     * content, resolves it against {@code referringPage} and stores the
     * resulting set as the outgoing URLs.
     *
     * @param referringPage the page the CSS text was loaded from; its URL and
     *                      path are the base for resolving relative links
     * @throws UnsupportedEncodingException propagated from URL canonicalization
     */
    public void setOutgoingUrls(WebURL referringPage) throws UnsupportedEncodingException {
        Set<WebURL> outgoingUrls = parseOutgoingUrls(referringPage);
        this.setOutgoingUrls(outgoingUrls);
    }

    /**
     * Builds the set of outgoing links found in the CSS text.
     *
     * @param referringPage the page the CSS text was loaded from
     * @return one {@link WebURL} per distinct extracted link; empty if the
     *         text contains none
     * @throws UnsupportedEncodingException propagated from URL canonicalization
     */
    private Set<WebURL> parseOutgoingUrls(WebURL referringPage) throws UnsupportedEncodingException {
        Set<String> extractedUrls = extractUrlInCssText(this.getTextContent());
        Set<WebURL> outgoingUrls = new HashSet<>();
        if (extractedUrls.isEmpty()) {
            // Nothing to resolve: skip canonicalization entirely, as before.
            return outgoingUrls;
        }

        final String pagePath = referringPage.getPath();
        final String rawPageUrl = referringPage.getURL();
        // The canonical form is the same for every link, so compute it once.
        // NOTE(review): falls back to the raw URL in case getCanonicalURL can
        // return null for input it cannot parse - confirm against URLCanonicalizer.
        final String canonicalPageUrl = URLCanonicalizer.getCanonicalURL(rawPageUrl);
        final String pageUrl = canonicalPageUrl != null ? canonicalPageUrl : rawPageUrl;

        for (String url : extractedUrls) {
            WebURL webURL = new WebURL();
            webURL.setURL(resolveAgainstPage(pageUrl, pagePath, url));
            outgoingUrls.add(webURL);
        }
        return outgoingUrls;
    }

    /**
     * Collects the distinct targets of all {@code url(...)} references in the
     * given CSS text.
     *
     * @param input CSS source; may be {@code null} or empty
     * @return the referenced URLs with surrounding whitespace removed,
     *         excluding {@code data:} URIs, empty references and anything
     *         inside comments; never {@code null}
     */
    private static Set<String> extractUrlInCssText(String input) {
        Set<String> extractedUrls = new HashSet<>();
        if (input == null || input.isEmpty()) {
            return extractedUrls;
        }

        Matcher matcher = URL_PATTERN.matcher(input);
        while (matcher.find()) {
            // Exactly one of groups 1-3 is set for a url(...) match;
            // none is set when the match was a comment.
            String url = null;
            for (int group = 1; group <= 3 && url == null; group++) {
                url = matcher.group(group);
            }
            if (url == null) {
                continue;
            }
            // The unquoted form captures trailing blanks ("url( a.png )"),
            // and quoted forms may contain padding inside the quotes.
            url = url.trim();
            if (url.isEmpty() || url.startsWith("data:")) {
                continue;
            }
            extractedUrls.add(url);
        }
        return extractedUrls;
    }

    /**
     * Turns a link found in the stylesheet into an absolute URL.
     *
     * <ul>
     *   <li>{@code http://} / {@code https://} links are already absolute and
     *       are returned unchanged, keeping their own host;</li>
     *   <li>protocol-relative links ({@code //host/path}) inherit the scheme
     *       of the referring page;</li>
     *   <li>everything else is resolved to a path on the referring page's
     *       host.</li>
     * </ul>
     *
     * @param pageUrl  absolute URL of the referring page
     * @param pagePath path component of the referring page
     * @param linkUrl  link exactly as extracted from the CSS text
     * @return the absolute URL the link points to
     */
    private static String resolveAgainstPage(String pageUrl, String pagePath, String linkUrl) {
        if (hasHttpScheme(linkUrl)) {
            return linkUrl;
        }
        if (linkUrl.startsWith("//")) {
            return getSchemeFromUrl(pageUrl) + linkUrl;
        }
        return getAbsoluteUrlFrom(pageUrl, getLinkRelativeTo(pagePath, linkUrl));
    }

    /**
     * Tells whether the link starts with {@code http://} or {@code https://}
     * (case-insensitively). A bare {@code "http"} prefix is not enough:
     * relative file names such as {@code http-icon.png} are not absolute.
     */
    private static boolean hasHttpScheme(String linkUrl) {
        return linkUrl.regionMatches(true, 0, "http://", 0, 7)
            || linkUrl.regionMatches(true, 0, "https://", 0, 8);
    }

    /**
     * Returns the scheme of {@code url} including the trailing colon
     * (e.g. {@code "https:"}), or {@code "http:"} when the URL has no
     * {@code "//"} separator to locate it by.
     */
    private static String getSchemeFromUrl(String url) {
        int separator = url.indexOf("//");
        return separator > 0 ? url.substring(0, separator) : "http:";
    }

    /**
     * Joins the scheme-and-host part of {@code pageUrl} with a path.
     *
     * @param pageUrl  absolute URL of the referring page
     * @param linkPath path to append; a leading slash is added if missing
     * @return scheme, host (and port) of {@code pageUrl} followed by the path
     */
    private static String getAbsoluteUrlFrom(String pageUrl, String linkPath) {
        String domainUrl = getFullDomainFromUrl(pageUrl);
        if (linkPath.startsWith("/")) {
            return domainUrl + linkPath;
        }
        return domainUrl + "/" + linkPath;
    }

    /**
     * Converts a host-less link into a path on the referring page's host.
     *
     * @param pagePath path of the referring page, e.g. {@code /css/site.css}
     * @param linkUrl  a root-relative ({@code /x}), parent-relative
     *                 ({@code ../x}) or document-relative ({@code x}) link
     * @return the link expressed as a path starting at the host root
     */
    private static String getLinkRelativeTo(String pagePath, String linkUrl) {
        if (linkUrl.startsWith("/")) {
            // Already anchored at the host root.
            return linkUrl;
        }

        if (linkUrl.startsWith("../")) {
            // Count only the *leading* "../" segments; a "../" further inside
            // the link is left in place rather than mistaken for a prefix.
            int parents = 0;
            int pos = 0;
            while (linkUrl.startsWith("../", pos)) {
                parents++;
                pos += 3;
            }

            // Climb from the page's directory, not from the page itself, so a
            // page path ending in "/" is not treated as if it named a file.
            String[] dirs = getDirsFromUrl(pagePath).split("/");
            int keep = dirs.length - parents; // <= 0 means "climbed past the root"

            StringBuilder absolute = new StringBuilder();
            for (int i = 0; i < keep; i++) {
                if (!dirs[i].isEmpty()) {
                    absolute.append('/').append(dirs[i]);
                }
            }
            return absolute.append('/').append(linkUrl.substring(pos)).toString();
        }

        // Plain relative link: it lives next to the referring page.
        return getDirsFromUrl(pagePath) + linkUrl;
    }

    /**
     * Returns the directory part of a path: everything up to and including
     * the last {@code '/'}, or the empty string if there is none.
     */
    private static String getDirsFromUrl(String urlPath) {
        return urlPath.substring(0, urlPath.lastIndexOf('/') + 1);
    }

    /**
     * Returns the scheme-and-authority prefix of an absolute URL, e.g.
     * {@code http://subdomain.domain:port} for
     * {@code http://subdomain.domain:port/dir/page.ext}.
     *
     * <p>When there is no {@code '/'} after the host the whole URL is
     * returned instead of failing on a negative index.
     */
    private static String getFullDomainFromUrl(String url) {
        int hostStart = url.indexOf("//") + 2;
        int hostEnd = url.indexOf('/', hostStart);
        return hostEnd < 0 ? url : url.substring(0, hostEnd);
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy