
at.molindo.webtools.crawler.CrawlerTask Maven / Gradle / Ivy
The newest version!
/**
* Copyright 2010 Molindo GmbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package at.molindo.webtools.crawler;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.HttpGet;
import org.w3c.tidy.Tidy;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import at.molindo.utils.io.StreamUtils;
/**
 * A single unit of crawl work: fetches one URL with the HTTP client of the
 * executing {@link CrawlerThread}, records status, timing, content type and
 * (for text responses) the body in a {@link CrawlerResult}, queues redirect
 * targets and the links found in HTML pages on the owning {@link Crawler},
 * and finally reports the result to that crawler.
 *
 * Must be executed on a {@link CrawlerThread}, because the HTTP client and the
 * SAX parser are taken from the current thread.
 */
public class CrawlerTask implements Runnable {

    /** Charset used for text bodies when Content-Type names none, or one this JVM cannot use. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /** Prefix of the Content-Type parameter that carries the charset name. */
    private static final String CHARSET_PARAM = "charset=";

    private final Crawler _crawler;
    private final String _urlString;
    private final CrawlerReferrer _referrer;
    private final boolean _tidy;

    /**
     * @param crawler  crawler that receives queued links and the final result
     * @param url      absolute URL to fetch
     * @param referrer where this URL was found; may be {@code null}
     * @param tidy     whether HTML is run through JTidy before it is handed to the SAX parser
     */
    public CrawlerTask(final Crawler crawler, final String url, final CrawlerReferrer referrer, final boolean tidy) {
        _crawler = crawler;
        _urlString = url;
        _referrer = referrer;
        _tidy = tidy;
    }

    /** @return the URL this task fetches */
    public String getUrlString() {
        return _urlString;
    }

    /** @return the referrer passed to the constructor, possibly {@code null} */
    public CrawlerReferrer getReferrer() {
        return _referrer;
    }

    /**
     * Fetches the URL and processes the response. A result is reported to the
     * crawler in every case where the thread check passes, including failures.
     *
     * @throws Error if the current thread is not a {@link CrawlerThread}
     */
    @Override
    public void run() {
        final Thread current = Thread.currentThread();
        if (!(current instanceof CrawlerThread)) {
            throw new Error("not a crawler thread");
        }
        final CrawlerThread crawlerThread = (CrawlerThread) current;

        final CrawlerResult sr = new CrawlerResult();
        sr.setUrl(_urlString);
        if (_referrer != null) {
            sr.getReferrers().add(_referrer);
        }

        try {
            // Built inside the try so that a syntactically invalid URL is
            // recorded on the result and still reported, instead of escaping run().
            final HttpGet get = new HttpGet(_urlString);

            final long start = System.currentTimeMillis();
            final HttpResponse response = crawlerThread.getClient().execute(get);
            sr.setStatus(response.getStatusLine().getStatusCode());
            // Time until the response head is available; reading the body is not included.
            sr.setTime((int) (System.currentTimeMillis() - start));

            final Header[] contentTypeHeader = response.getHeaders("Content-Type");
            final boolean hasContentType = contentTypeHeader != null && contentTypeHeader.length > 0;
            sr.setContentType(hasContentType ? contentTypeHeader[0].getValue() : null);

            // Always drain the body (if any) so the stream gets closed.
            final Object content = readBody(response, sr.getContentType());

            if (sr.getStatus() / 100 == 3) {
                queueRedirect(response);
            } else if (sr.getStatus() == HttpStatus.SC_OK && content instanceof String) {
                // content is only a String when the content type starts with
                // "text/", so getContentType() cannot be null here.
                sr.setText((String) content);
                if (sr.getContentType().startsWith("text/html")) {
                    parseResult(sr.getText());
                }
            }
        } catch (final MalformedURLException e) {
            sr.setErrorMessage(e.getMessage());
        } catch (final IOException e) {
            sr.setErrorMessage(e.getMessage());
            e.printStackTrace();
        } catch (final SAXException e) {
            sr.setErrorMessage(e.getMessage());
        } catch (final Throwable t) {
            // Record the failure so the reported result does not look like a clean fetch.
            sr.setErrorMessage(t.toString());
            t.printStackTrace();
        } finally {
            _crawler.report(sr);
        }
    }

    /**
     * Reads the response body, if the response carries one.
     *
     * @return the body as returned by {@link #consumeContent}, or {@code null}
     *         when the response has no entity
     */
    private Object readBody(final HttpResponse response, final String contentType) throws IOException {
        if (response.getEntity() == null) {
            return null;
        }
        // The charset comes from the Content-Type header. The entity's
        // Content-Encoding names a content coding such as gzip, not a charset,
        // and must not be handed to InputStreamReader.
        return consumeContent(response.getEntity().getContent(), contentType, response.getEntity()
                .getContentLength(), charsetOf(contentType));
    }

    /**
     * Queues the target of a 3xx response, or prints a warning to stderr if
     * the response has no Location header. A location starting with "/" is
     * appended to the crawler's host prefix; anything else is queued as-is.
     */
    private void queueRedirect(final HttpResponse response) {
        final Header[] locationHeader = response.getHeaders("location");
        if (locationHeader == null || locationHeader.length == 0) {
            System.err.println("redirect without location from " + _urlString);
            return;
        }
        String redirectLocation = locationHeader[0].getValue();
        if (redirectLocation.startsWith("/")) {
            redirectLocation = _crawler._host + redirectLocation.substring(1);
        }
        final String reason = response.getStatusLine().getReasonPhrase() + ": " + _referrer;
        _crawler.queue(redirectLocation, new CrawlerReferrer(_urlString, reason));
    }

    /**
     * Extracts the charset name from a Content-Type header value such as
     * {@code text/html; charset=ISO-8859-1}.
     *
     * @param contentType header value, may be {@code null}
     * @return the declared charset if this JVM supports it, otherwise {@value #DEFAULT_CHARSET}
     */
    private static String charsetOf(final String contentType) {
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        final String[] parts = contentType.split(";");
        // parts[0] is the media type itself; parameters start at index 1
        for (int i = 1; i < parts.length; i++) {
            final String param = parts[i].trim();
            if (!param.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
                continue;
            }
            String name = param.substring(CHARSET_PARAM.length()).trim();
            if (name.length() >= 2 && name.charAt(0) == '"' && name.charAt(name.length() - 1) == '"') {
                name = name.substring(1, name.length() - 1);
            }
            try {
                if (Charset.isSupported(name)) {
                    return name;
                }
            } catch (final IllegalArgumentException e) {
                // syntactically illegal charset name: fall back to the default
            }
            break;
        }
        return DEFAULT_CHARSET;
    }

    /**
     * Reads the stream to its end and closes it.
     *
     * @param content       stream to read; always closed before returning
     * @param contentType   Content-Type header value, may be {@code null}
     * @param contentLength declared length, used only to pre-size the byte buffer
     * @param charsetName   charset for decoding text; {@code null} means {@value #DEFAULT_CHARSET}
     * @return a {@code String} for "text/*" content (line terminators normalized
     *         to "\n", no trailing newline), otherwise a {@code byte[]}
     */
    private Object consumeContent(final InputStream content, String contentType, final long contentLength,
            final String charsetName) throws IOException {
        if (contentType == null) {
            contentType = "";
        }
        try {
            if (contentType.startsWith("text/")) {
                final BufferedReader r = new BufferedReader(new InputStreamReader(content,
                        charsetName == null ? DEFAULT_CHARSET : charsetName));
                final StringBuilder buf = new StringBuilder();
                String line;
                while ((line = r.readLine()) != null) {
                    buf.append(line).append("\n");
                }
                if (buf.length() > 0) {
                    // drop the newline appended after the last line
                    buf.setLength(buf.length() - 1);
                }
                return buf.toString();
            }
            final boolean usableLength = contentLength > 0 && contentLength <= Integer.MAX_VALUE;
            final ByteArrayOutputStream out = new ByteArrayOutputStream(usableLength ? (int) contentLength : 4096);
            StreamUtils.copy(content, out, 4096);
            return out.toByteArray();
        } finally {
            try {
                content.close();
            } catch (final IOException ignored) {
                // best effort: the body has already been read or has already failed
            }
        }
    }

    /**
     * Parses an HTML document and queues every followable {@code <a href>}
     * target on the crawler.
     *
     * @param string the HTML text
     * @throws SAXException if the SAX parser rejects the document
     * @throws IOException  if reading the input or resolving an entity fails
     */
    protected void parseResult(final String string) throws SAXException, IOException {
        final InputSource inputSource;
        if (_tidy) {
            final Tidy tidy = new Tidy();
            tidy.setXHTML(true);
            tidy.setErrfile("/dev/null");
            // NOTE(review): getBytes() uses the platform default charset and Tidy is
            // left at its default input encoding - confirm these agree for non-ASCII pages.
            final ByteArrayInputStream in = new ByteArrayInputStream(string.getBytes());
            final ByteArrayOutputStream out = new ByteArrayOutputStream();
            tidy.parse(in, out);
            inputSource = new InputSource(new ByteArrayInputStream(out.toByteArray()));
        } else {
            inputSource = new InputSource(new StringReader(string));
        }

        ((CrawlerThread) Thread.currentThread()).getParser().parse(inputSource, new DefaultHandler() {
            @Override
            public void startElement(final String uri, final String localName, final String name,
                    final Attributes attributes) throws SAXException {
                if (!"a".equals(name)) {
                    return;
                }
                final String href = stripFragment(attributes.getValue("href"));
                if (href == null) {
                    return;
                }
                final String target = resolveLink(_urlString, _crawler._host, href);
                if (target != null) {
                    _crawler.queue(target, new CrawlerReferrer(_urlString, href));
                }
            }

            @Override
            public InputSource resolveEntity(final String publicId, String systemId) throws IOException, SAXException {
                if ("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd".equals(systemId)) {
                    // Use the bundled copy of the DTD when it is on the class path;
                    // otherwise keep the original system id.
                    final URL local = getClass().getClassLoader().getResource("xhtml1-transitional.dtd");
                    if (local != null) {
                        systemId = local.toString();
                    }
                }
                return _crawler.getDtdMemoryCache().resolveEntity(publicId, systemId);
            }
        });
    }

    /**
     * Removes the fragment ("#...") from a link.
     *
     * @return the link without its fragment, or {@code null} if the link is
     *         {@code null} or points only to an anchor on the same page
     */
    private static String stripFragment(final String href) {
        if (href == null) {
            return null;
        }
        final int anchorIndex = href.indexOf('#');
        if (anchorIndex == 0) {
            return null;
        }
        return anchorIndex > 0 ? href.substring(0, anchorIndex) : href;
    }

    /**
     * Turns a link found on a page into the absolute URL to queue.
     *
     * @param baseUrl URL of the page the link was found on
     * @param host    the crawler's host prefix; absolute links must start with it
     * @param href    the link, fragment already removed
     * @return the absolute URL to queue, or {@code null} if the link must not
     *         be followed (other host, non-HTTP scheme, unresolvable)
     */
    private static String resolveLink(final String baseUrl, final String host, final String href) {
        if (href.startsWith("//")) {
            // protocol-relative: inherits the scheme of the current page
            final String baseScheme = schemeOf(baseUrl);
            if (baseScheme == null) {
                return null;
            }
            final String absolute = baseScheme + ":" + href;
            return absolute.startsWith(host) ? absolute : null;
        }
        if (href.startsWith("/")) {
            return host + href.substring(1);
        }
        final String scheme = schemeOf(href);
        if (scheme != null) {
            // javascript:, mailto:, ftp:, tel:, data: ... are never crawled
            final boolean web = "http".equalsIgnoreCase(scheme) || "https".equalsIgnoreCase(scheme);
            return web && href.startsWith(host) ? href : null;
        }
        try {
            // Handles "./", "../", query-only links and base URLs without a path.
            return new URL(new URL(baseUrl), href).toString();
        } catch (final MalformedURLException e) {
            return null;
        }
    }

    /**
     * @return the URI scheme of {@code s} (the text before the first ':'), or
     *         {@code null} if {@code s} does not start with a scheme
     */
    private static String schemeOf(final String s) {
        for (int i = 0; i < s.length(); i++) {
            final char c = s.charAt(i);
            if (c == ':') {
                return i == 0 ? null : s.substring(0, i);
            }
            final boolean letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
            final boolean later = i > 0 && ((c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.');
            if (!letter && !later) {
                return null;
            }
        }
        return null;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy