All Downloads are FREE. Search and download functionality uses the official Maven repository.

at.molindo.webtools.crawler.CrawlerTask Maven / Gradle / Ivy

The newest version!
/**
 * Copyright 2010 Molindo GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package at.molindo.webtools.crawler;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;

import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.HttpGet;
import org.w3c.tidy.Tidy;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

import at.molindo.utils.io.StreamUtils;

/**
 * Crawl job for a single URL.
 * <p>
 * When run on a {@link CrawlerThread}, the task issues an HTTP GET for its URL
 * with that thread's client, records status, response time, content type and
 * (for textual 200 responses) the body in a {@link CrawlerResult} and always
 * hands that result to {@code Crawler.report}. Redirect targets and the links
 * found in {@code text/html} bodies are passed to {@code Crawler.queue}.
 */
public class CrawlerTask implements Runnable {

	/** Charset used for text bodies when the Content-Type header names no usable one. */
	private static final String DEFAULT_CHARSET = "utf-8";

	/** Chunk size for copying binary bodies; also the buffer size when the length is unknown. */
	private static final int COPY_BUFFER_SIZE = 4096;

	/**
	 * Cap for the pre-allocated binary buffer, so that a bogus Content-Length
	 * header cannot force a huge up-front allocation. The buffer still grows
	 * on demand.
	 */
	private static final int MAX_INITIAL_BUFFER = 1 << 20;

	private final Crawler _crawler;
	private final String _urlString;
	private final CrawlerReferrer _referrer;
	private final boolean _tidy;

	/**
	 * @param crawler
	 *            crawler that receives the result and newly discovered URLs
	 * @param url
	 *            URL to fetch
	 * @param referrer
	 *            where the URL was found, may be <code>null</code>
	 * @param tidy
	 *            if <code>true</code>, HTML is run through JTidy before it is
	 *            handed to the SAX parser
	 */
	public CrawlerTask(final Crawler crawler, final String url, final CrawlerReferrer referrer, final boolean tidy) {
		_crawler = crawler;
		_urlString = url;
		_referrer = referrer;
		_tidy = tidy;
	}

	/**
	 * @return the URL this task fetches
	 */
	public String getUrlString() {
		return _urlString;
	}

	/**
	 * @return the referrer passed to the constructor, may be <code>null</code>
	 */
	public CrawlerReferrer getReferrer() {
		return _referrer;
	}

	/**
	 * Fetches the URL and reports the outcome. Must be executed by a
	 * {@link CrawlerThread}, as the HTTP client and the SAX parser are taken
	 * from the current thread.
	 * 
	 * @throws Error
	 *             if the current thread is not a {@link CrawlerThread}
	 */
	@Override
	public void run() {
		if (!(Thread.currentThread() instanceof CrawlerThread)) {
			throw new Error("not a cralwer thread");
		}
		final CrawlerThread thread = (CrawlerThread) Thread.currentThread();

		final CrawlerResult sr = new CrawlerResult();
		sr.setUrl(_urlString);
		if (_referrer != null) {
			sr.getReferrers().add(_referrer);
		}

		try {
			// created inside the try block: HttpGet rejects invalid URLs with an
			// unchecked exception and the result must be reported nevertheless
			final HttpGet get = new HttpGet(_urlString);

			final long start = System.currentTimeMillis();
			final HttpResponse response = thread.getClient().execute(get);

			sr.setStatus(response.getStatusLine().getStatusCode());
			sr.setTime((int) (System.currentTimeMillis() - start));

			final Header[] contentTypeHeader = response.getHeaders("Content-Type");
			sr.setContentType(contentTypeHeader == null || contentTypeHeader.length == 0 ? null : contentTypeHeader[0]
					.getValue());

			// responses without a body have no entity; the charset is taken from
			// Content-Type, as Content-Encoding names a coding such as gzip
			final Object content = response.getEntity() == null ? null : consumeContent(response.getEntity()
					.getContent(), sr.getContentType(), response.getEntity().getContentLength(),
					charsetOf(sr.getContentType()));

			if (sr.getStatus() / 100 == 3) {
				final Header[] locationHeader = response.getHeaders("location");
				if (locationHeader != null && locationHeader.length > 0) {
					final String location = locationHeader[0].getValue();
					final String resolved = resolve(location);
					// fall back to the raw header value if it cannot be resolved
					_crawler.queue(resolved != null ? resolved : location, new CrawlerReferrer(_urlString, response
							.getStatusLine().getReasonPhrase() + ": " + _referrer));
				} else {
					System.err.println("redirect without location from " + _urlString);
				}
			} else if (sr.getStatus() == HttpStatus.SC_OK) {
				if (content instanceof String) {
					sr.setText((String) content);

					// content is only a String for "text/..." content types,
					// hence the content type is not null here
					if (sr.getContentType().startsWith("text/html")) {
						parseResult(sr.getText());
					}
				}
			}
		} catch (final MalformedURLException e) {
			sr.setErrorMessage(e.getMessage());
		} catch (final IOException e) {
			sr.setErrorMessage(e.getMessage());
			e.printStackTrace();
		} catch (final SAXException e) {
			sr.setErrorMessage(e.getMessage());
		} catch (final Throwable t) {
			// make unexpected failures visible in the reported result, too
			sr.setErrorMessage(t.toString());
			t.printStackTrace();
		} finally {
			_crawler.report(sr);
		}

	}

	/**
	 * Reads the response body completely and closes the stream.
	 * 
	 * @param content
	 *            body stream, closed by this method
	 * @param contentType
	 *            value of the Content-Type header, may be <code>null</code>
	 * @param contentLength
	 *            announced length, only used as a hint for the buffer size
	 * @param encoding
	 *            charset name used to decode text, <code>null</code> for utf-8
	 * @return a {@link String} for "text/..." content types (lines joined by
	 *         "\n", without trailing line break), a <code>byte[]</code>
	 *         otherwise
	 * @throws IOException
	 *             if reading fails or the charset is not supported
	 */
	private Object consumeContent(final InputStream content, String contentType, final long contentLength,
			final String encoding) throws IOException {
		if (contentType == null) {
			contentType = "";
		}

		try {
			if (contentType.startsWith("text/")) {
				final BufferedReader r = new BufferedReader(new InputStreamReader(content,
						encoding == null ? DEFAULT_CHARSET : encoding));

				String line;
				final StringBuilder buf = new StringBuilder();

				while ((line = r.readLine()) != null) {
					buf.append(line).append("\n");
				}
				if (buf.length() > 0) {
					// drop the line break appended after the last line
					buf.setLength(buf.length() - 1);
				}
				return buf.toString();
			} else {
				final int initialSize = contentLength > 0 ? (int) Math.min(contentLength, MAX_INITIAL_BUFFER)
						: COPY_BUFFER_SIZE;
				final ByteArrayOutputStream out = new ByteArrayOutputStream(initialSize);
				StreamUtils.copy(content, out, COPY_BUFFER_SIZE);
				return out.toByteArray();
			}
		} finally {
			try {
				content.close();
			} catch (final IOException ignored) {
				// the body was already consumed (or reading failed and that
				// exception is propagating), a failing close adds nothing
			}
		}
	}

	/**
	 * Extracts the charset from a Content-Type header value such as
	 * <code>text/html; charset=ISO-8859-1</code>.
	 * 
	 * @param contentType
	 *            header value, may be <code>null</code>
	 * @return the declared charset if this JVM supports it, utf-8 otherwise
	 */
	private static String charsetOf(final String contentType) {
		if (contentType != null) {
			final String key = "charset=";
			for (final String part : contentType.split(";")) {
				final String param = part.trim();
				if (param.regionMatches(true, 0, key, 0, key.length())) {
					String name = param.substring(key.length()).trim();
					if (name.length() > 1 && name.startsWith("\"") && name.endsWith("\"")) {
						name = name.substring(1, name.length() - 1);
					}
					try {
						if (Charset.isSupported(name)) {
							return name;
						}
					} catch (final IllegalArgumentException ignored) {
						// syntactically illegal charset name: use the default
					}
				}
			}
		}
		return DEFAULT_CHARSET;
	}

	/**
	 * Parses an HTML document and queues the targets of its <code>a</code>
	 * elements. Uses the SAX parser of the current {@link CrawlerThread}.
	 * 
	 * @param string
	 *            HTML source
	 * @throws SAXException
	 *             if the (optionally tidied) document cannot be parsed
	 * @throws IOException
	 *             if reading the document or one of its entities fails
	 */
	protected void parseResult(final String string) throws SAXException, IOException {

		final InputSource inputSource;
		if (_tidy) {
			final Tidy tidy = new Tidy();
			tidy.setXHTML(true);
			tidy.setErrfile("/dev/null");
			// NOTE(review): the text is converted to bytes with the platform
			// default charset and Tidy's encoding settings are left untouched -
			// confirm that non-ASCII hrefs survive this round trip
			final ByteArrayInputStream in = new ByteArrayInputStream(string.getBytes());
			final ByteArrayOutputStream out = new ByteArrayOutputStream();

			tidy.parse(in, out);

			inputSource = new InputSource(new ByteArrayInputStream(out.toByteArray()));
		} else {
			inputSource = new InputSource(new StringReader(string));
		}

		((CrawlerThread) Thread.currentThread()).getParser().parse(inputSource, new DefaultHandler() {

			@Override
			public void startElement(final String uri, final String localName, final String name,
					final Attributes attributes) throws SAXException {
				if ("a".equals(name)) {
					final String href = attributes.getValue("href");
					if (href != null) {
						queueLink(href);
					}
				}
			}

			@Override
			public InputSource resolveEntity(final String publicId, String systemId) throws IOException, SAXException {
				if ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd".equals(systemId)) {
					// prefer the copy from the classpath, keep the original
					// system id if that resource is missing
					final URL dtd = getClass().getClassLoader().getResource("xhtml1-transitional.dtd");
					if (dtd != null) {
						systemId = dtd.toString();
					}
				}

				return _crawler.getDtdMemoryCache().resolveEntity(publicId, systemId);
			}
		});
	}

	/**
	 * Queues the target of a link found in the fetched document. The fragment
	 * is removed first; links that then point to the document itself (empty or
	 * fragment only) are ignored. Links that carry their own scheme or
	 * authority are only queued if they start with the crawler's host prefix.
	 * 
	 * @param rawHref
	 *            value of the href attribute, not <code>null</code>
	 */
	private void queueLink(final String rawHref) {
		String href = rawHref.trim();

		// the fragment starts at the first '#'
		final int fragment = href.indexOf('#');
		if (fragment >= 0) {
			href = href.substring(0, fragment);
		}
		if (href.length() == 0) {
			// link to the same document: ignore
			return;
		}

		final boolean mayLeaveHost = hasScheme(href) || href.startsWith("//");
		final String target = resolve(href);
		if (target == null) {
			System.err.println("unresolvable link '" + href + "' on " + _urlString);
			return;
		}
		if (mayLeaveHost && !target.startsWith(_crawler._host)) {
			// other host or other scheme (mailto:, javascript:, ...)
			return;
		}

		_crawler.queue(target, new CrawlerReferrer(_urlString, href));
	}

	/**
	 * Turns a link found at this task's URL into an absolute URL.
	 * <ul>
	 * <li>links with a scheme are returned unchanged</li>
	 * <li>links starting with a single "/" are appended to the crawler's host
	 * prefix (presumably ending with "/", as the leading slash of the link is
	 * dropped - TODO confirm)</li>
	 * <li>anything else, including "//host/path", is resolved against this
	 * task's URL</li>
	 * </ul>
	 * 
	 * @param link
	 *            link target without fragment
	 * @return the absolute URL or <code>null</code> if the link cannot be
	 *         resolved
	 */
	private String resolve(final String link) {
		if (hasScheme(link)) {
			return link;
		}
		if (link.startsWith("/") && !link.startsWith("//")) {
			return _crawler._host + link.substring(1);
		}
		try {
			return new URL(new URL(_urlString), link).toExternalForm();
		} catch (final MalformedURLException e) {
			return null;
		}
	}

	/**
	 * @param link
	 *            link target
	 * @return <code>true</code> if the link starts with a URI scheme, i.e. a
	 *         letter followed by letters, digits, '+', '-' or '.' and a ':'
	 */
	private static boolean hasScheme(final String link) {
		for (int i = 0; i < link.length(); i++) {
			final char c = link.charAt(i);
			if (c == ':') {
				return i > 0;
			}
			final boolean letter = c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z';
			final boolean tail = c >= '0' && c <= '9' || c == '+' || c == '-' || c == '.';
			if (!letter && !(i > 0 && tail)) {
				return false;
			}
		}
		return false;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy