All downloads are free. The search and download functionality uses the official Maven repository.

com.jaeksoft.searchlib.crawler.web.spider.DownloadItem Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class search engine. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you will be able to quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

The newest version!
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2012-2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.web.spider;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.CRC32;

import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
import org.apache.commons.io.FilenameUtils;
import org.apache.http.Header;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import com.jaeksoft.searchlib.SearchLibException.WrongStatusCodeException;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.StringUtils;

/**
 * Holds the metadata (status, content headers, redirect location, ...) and the
 * content stream associated with the download of a single URI.
 * <p>
 * The metadata can be serialised to JSON with {@link #getMetaAsJson()} and
 * restored with {@link #loadMetaFromJson(JSONObject)}; an item restored that
 * way reports {@code true} from {@link #isFromCache()}.
 * <p>
 * The class performs no synchronisation of its own.
 */
public class DownloadItem {

	private final URI uri;
	private URI redirectLocation = null;
	private Long contentLength = null;
	private String contentDispositionFilename = null;
	private String contentBaseType = null;
	private String contentTypeCharset = null;
	private String contentEncoding = null;
	private String contentLocation = null;
	private Long lastModified = null;
	private Integer statusCode = null;
	private String reasonPhrase = null;
	private InputStream contentInputStream = null;
	private boolean fromCache = false;
	/** Headers rendered as "Name: value" strings (this is what is persisted). */
	private List<String> headers = null;
	/** Raw headers as passed to {@link #setHeaders(Header[])}; not persisted. */
	private Header[] httpHeaders = null;

	/**
	 * @param uri
	 *            the URI this item was (or will be) downloaded from; may be
	 *            null
	 */
	public DownloadItem(URI uri) {
		this.uri = uri;
	}

	// Keys used in the JSON representation of the metadata.
	protected static final String KEY_REDIRECT_LOCATION = "KEY_REDIRECT_LOCATION";
	protected static final String KEY_CONTENT_DISPOSITION_FILENAME = "KEY_CONTENT_DISPOSITION_FILENAME";
	protected static final String KEY_CONTENT_LENGTH = "KEY_CONTENT_LENGTH";
	protected static final String KEY_LAST_MODIFIED = "KEY_LAST_MODIFIED";
	protected static final String KEY_CONTENT_BASE_TYPE = "KEY_CONTENT_BASE_TYPE";
	protected static final String KEY_CONTENT_TYPE_CHARSET = "KEY_CONTENT_TYPE_CHARSET";
	protected static final String KEY_CONTENT_ENCODING = "KEY_CONTENT_ENCODING";
	protected static final String KEY_CONTENT_LOCATION = "KEY_CONTENT_LOCATION";
	protected static final String KEY_STATUS_CODE = "KEY_STATUS_CODE";
	protected static final String KEY_REASON_PHRASE = "KEY_REASON_PHRASE";
	protected static final String KEY_HEADERS = "KEY_HEADERS";

	/**
	 * Serialises the metadata to a JSON object string. Only the properties
	 * that are currently non-null are written.
	 * 
	 * @return the JSON text
	 * @throws JSONException
	 *             if the JSON object cannot be built
	 */
	public String getMetaAsJson() throws JSONException {
		JSONObject json = new JSONObject();

		if (redirectLocation != null)
			json.put(KEY_REDIRECT_LOCATION, redirectLocation.toASCIIString());

		if (contentLength != null)
			json.put(KEY_CONTENT_LENGTH, contentLength);

		if (lastModified != null)
			json.put(KEY_LAST_MODIFIED, lastModified);

		if (contentDispositionFilename != null)
			json.put(KEY_CONTENT_DISPOSITION_FILENAME,
					contentDispositionFilename);

		if (contentBaseType != null)
			json.put(KEY_CONTENT_BASE_TYPE, contentBaseType);

		if (contentTypeCharset != null)
			json.put(KEY_CONTENT_TYPE_CHARSET, contentTypeCharset);

		if (contentEncoding != null)
			json.put(KEY_CONTENT_ENCODING, contentEncoding);

		if (contentLocation != null)
			json.put(KEY_CONTENT_LOCATION, contentLocation);

		if (statusCode != null)
			json.put(KEY_STATUS_CODE, statusCode);

		if (reasonPhrase != null)
			json.put(KEY_REASON_PHRASE, reasonPhrase);

		if (headers != null)
			json.put(KEY_HEADERS, headers);

		return json.toString();
	}

	/**
	 * Restores the metadata from a JSON object, as produced by
	 * {@link #getMetaAsJson()}, and flags this item as coming from the cache.
	 * Properties whose key is absent from the JSON object are left unchanged.
	 * 
	 * @param json
	 *            the JSON object holding the metadata
	 * @throws URISyntaxException
	 *             if the stored redirect location is not a valid URI
	 * @throws JSONException
	 *             if a stored value does not have the expected type
	 */
	public void loadMetaFromJson(org.json.JSONObject json)
			throws URISyntaxException, JSONException {

		fromCache = true;

		if (json.has(KEY_REDIRECT_LOCATION)) {
			String s = json.getString(KEY_REDIRECT_LOCATION);
			if (s != null)
				redirectLocation = new URI(s);
		}
		if (json.has(KEY_CONTENT_LENGTH))
			contentLength = json.getLong(KEY_CONTENT_LENGTH);

		if (json.has(KEY_LAST_MODIFIED))
			lastModified = json.getLong(KEY_LAST_MODIFIED);

		if (json.has(KEY_CONTENT_DISPOSITION_FILENAME))
			contentDispositionFilename = json
					.getString(KEY_CONTENT_DISPOSITION_FILENAME);

		if (json.has(KEY_CONTENT_BASE_TYPE))
			contentBaseType = json.getString(KEY_CONTENT_BASE_TYPE);

		if (json.has(KEY_CONTENT_TYPE_CHARSET))
			contentTypeCharset = json.getString(KEY_CONTENT_TYPE_CHARSET);

		if (json.has(KEY_CONTENT_ENCODING))
			contentEncoding = json.getString(KEY_CONTENT_ENCODING);

		if (json.has(KEY_CONTENT_LOCATION))
			contentLocation = json.getString(KEY_CONTENT_LOCATION);

		if (json.has(KEY_STATUS_CODE))
			statusCode = json.getInt(KEY_STATUS_CODE);

		if (json.has(KEY_REASON_PHRASE))
			reasonPhrase = json.getString(KEY_REASON_PHRASE);

		if (json.has(KEY_HEADERS)) {
			headers = new ArrayList<String>();
			JSONArray headerJsonArray = json.getJSONArray(KEY_HEADERS);
			if (headerJsonArray != null)
				for (int i = 0; i < headerJsonArray.length(); i++)
					headers.add(headerJsonArray.get(i).toString());
		}
	}

	/**
	 * @return the redirectLocation
	 */
	public URI getRedirectLocation() {
		return redirectLocation;
	}

	/**
	 * @param redirectLocation
	 *            the redirectLocation to set
	 */
	public void setRedirectLocation(URI redirectLocation) {
		this.redirectLocation = redirectLocation;
	}

	/**
	 * @return the contentLength
	 */
	public Long getContentLength() {
		return contentLength;
	}

	/**
	 * @return the lastModified
	 */
	public Long getLastModified() {
		return lastModified;
	}

	/**
	 * @param lastModified
	 *            the lastModified to set
	 */
	public void setLastModified(Long lastModified) {
		this.lastModified = lastModified;
	}

	/**
	 * @param contentLength
	 *            the contentLength to set
	 */
	public void setContentLength(Long contentLength) {
		this.contentLength = contentLength;
	}

	/**
	 * @return the contentDispositionFilename
	 */
	public String getContentDispositionFilename() {
		return contentDispositionFilename;
	}

	/**
	 * @param contentDispositionFilename
	 *            the contentDispositionFilename to set
	 */
	public void setContentDispositionFilename(String contentDispositionFilename) {
		this.contentDispositionFilename = contentDispositionFilename;
	}

	/**
	 * Returns a file name for the downloaded content: the content disposition
	 * filename when one is set, otherwise the name part of the URI path.
	 * 
	 * @return the file name, or null if there is neither a content
	 *         disposition filename nor a URI path
	 * @throws MalformedURLException
	 *             if the absolute URI cannot be converted to a URL
	 */
	public String getFileName() throws MalformedURLException {
		if (contentDispositionFilename != null)
			return contentDispositionFilename;
		if (uri == null)
			return null;
		// URI.toURL() throws an unchecked IllegalArgumentException on a
		// relative URI; read the raw path directly in that case.
		String urlFile = uri.isAbsolute() ? uri.toURL().getPath() : uri
				.getRawPath();
		if (urlFile == null)
			return null;
		return FilenameUtils.getName(urlFile);
	}

	/**
	 * @return the contentBaseType
	 */
	public String getContentBaseType() {
		return contentBaseType;
	}

	/**
	 * @param contentBaseType
	 *            the contentBaseType to set
	 */
	public void setContentBaseType(String contentBaseType) {
		this.contentBaseType = contentBaseType;
	}

	/**
	 * @return the contentTypeCharset
	 */
	public String getContentTypeCharset() {
		return contentTypeCharset;
	}

	/**
	 * @param contentTypeCharset
	 *            the contentTypeCharset to set
	 */
	public void setContentTypeCharset(String contentTypeCharset) {
		this.contentTypeCharset = contentTypeCharset;
	}

	/**
	 * @return the contentEncoding
	 */
	public String getContentEncoding() {
		return contentEncoding;
	}

	/**
	 * @param contentEncoding
	 *            the contentEncoding to set
	 */
	public void setContentEncoding(String contentEncoding) {
		this.contentEncoding = contentEncoding;
	}

	/**
	 * @return the statusCode
	 */
	public Integer getStatusCode() {
		return statusCode;
	}

	/**
	 * Checks that the status code lies within the given range.
	 * 
	 * @param fromInclusive
	 *            the lowest accepted status code
	 * @param toExclusive
	 *            the first status code above the accepted range
	 * @throws WrongStatusCodeException
	 *             if no status code is set or if it is outside the range
	 */
	public void checkNoErrorRange(int fromInclusive, int toExclusive)
			throws WrongStatusCodeException {
		if (statusCode == null)
			throw new WrongStatusCodeException("No status code - ", uri);
		if (statusCode < fromInclusive || statusCode >= toExclusive)
			throw new WrongStatusCodeException("Wrong status code: ",
					statusCode, ' ', reasonPhrase, " - ", uri);
	}

	/**
	 * Checks that the status code is one of the given codes.
	 * 
	 * @param validCodes
	 *            the accepted status codes
	 * @throws WrongStatusCodeException
	 *             if no status code is set or if it is not one of the
	 *             accepted codes
	 */
	public void checkNoErrorList(int... validCodes)
			throws WrongStatusCodeException {
		// Same message as checkNoErrorRange for a missing status code.
		if (statusCode == null)
			throw new WrongStatusCodeException("No status code - ", uri);
		for (int validCode : validCodes)
			if (statusCode == validCode)
				return;
		throw new WrongStatusCodeException("Wrong status code: ", statusCode,
				' ', reasonPhrase, " - ", uri);
	}

	/**
	 * @param statusCode
	 *            the statusCode to set
	 */
	public void setStatusCode(Integer statusCode) {
		this.statusCode = statusCode;
	}

	/**
	 * @return the reasonPhrase
	 */
	public String getReasonPhrase() {
		return reasonPhrase;
	}

	/**
	 * @param reasonPhrase
	 *            the reasonPhrase to set
	 */
	public void setReasonPhrase(String reasonPhrase) {
		this.reasonPhrase = reasonPhrase;
	}

	/**
	 * @return the contentInputStream
	 */
	public InputStream getContentInputStream() {
		return contentInputStream;
	}

	/**
	 * @param contentInputStream
	 *            the inputStream to set
	 */
	public void setContentInputStream(InputStream contentInputStream) {
		this.contentInputStream = contentInputStream;
	}

	/**
	 * @return the uri
	 */
	public URI getUri() {
		return uri;
	}

	/**
	 * @return the fromCache
	 */
	public boolean isFromCache() {
		return fromCache;
	}

	/**
	 * @return the headers as "Name: value" strings, or null if none were set
	 *         or loaded. The internal list is returned, not a copy.
	 */
	public List<String> getHeaders() {
		return headers;
	}

	/**
	 * Stores the raw headers and rebuilds the "Name: value" string list from
	 * them. Passing null clears the raw headers only: the string list built
	 * or loaded earlier is left untouched.
	 * 
	 * @param headers
	 *            the raw headers, may be null
	 */
	public void setHeaders(Header[] headers) {
		httpHeaders = headers;
		if (headers == null)
			return;
		this.headers = new ArrayList<String>(headers.length);
		for (Header header : headers) {
			StringBuilder sb = new StringBuilder();
			sb.append(header.getName());
			sb.append(": ");
			sb.append(header.getValue());
			this.headers.add(sb.toString());
		}
	}

	/**
	 * Looks up a header, ignoring case, among the raw headers given to
	 * {@link #setHeaders(Header[])}. The headers restored by
	 * {@link #loadMetaFromJson(JSONObject)} are not searched.
	 * 
	 * @param name
	 *            the header name
	 * @return the value of the first matching header, or null if there is
	 *         none
	 */
	public String getFirstHttpHeader(String name) {
		if (httpHeaders == null)
			return null;
		for (Header header : httpHeaders)
			if (header.getName().equalsIgnoreCase(name))
				return header.getValue();
		return null;
	}

	/**
	 * Reads the content stream into a string.
	 * 
	 * @return the content, or null if no content stream is set
	 * @throws IOException
	 *             if the stream cannot be read
	 */
	public String getContentAsString() throws IOException {
		if (contentInputStream == null)
			return null;
		return IOUtils.toString(contentInputStream);
	}

	/**
	 * @return the contentLocation
	 */
	public String getContentLocation() {
		return contentLocation;
	}

	/**
	 * @param contentLocation
	 *            the contentLocation to set
	 */
	public void setContentLocation(String contentLocation) {
		this.contentLocation = contentLocation;
	}

	/**
	 * Copies the content stream to the given file. Does nothing if no content
	 * stream is set.
	 * 
	 * @param file
	 *            the file to write to
	 * @throws IOException
	 *             if the file cannot be opened or the copy fails
	 */
	public void writeToFile(File file) throws IOException {
		if (contentInputStream == null)
			return;
		FileOutputStream fos = null;
		BufferedOutputStream bos = null;
		try {
			fos = new FileOutputStream(file);
			bos = new BufferedOutputStream(fos);
			IOUtils.copy(contentInputStream, bos);
		} finally {
			IOUtils.close(bos, fos);
		}
	}

	/**
	 * Writes the content stream as a new entry of the given ZIP archive. Does
	 * nothing if no content stream is set.
	 * <p>
	 * The entry path is made of the host labels in reverse order, the
	 * directory segments of the URI path, then the content disposition
	 * filename if set, else the last path segment (or "index" if there is
	 * none). "/_index" is appended when the URI path ends with a slash, and a
	 * ".&lt;CRC32&gt;" suffix computed from the query and fragment is appended
	 * when either of them is present.
	 * <p>
	 * The buffered wrapper around the content stream is passed to
	 * {@code IOUtils.close} once the copy is done.
	 * 
	 * @param zipOutput
	 *            the archive to append the entry to
	 * @throws IOException
	 *             if the entry cannot be written
	 */
	public void writeToZip(ZipArchiveOutputStream zipOutput) throws IOException {
		if (contentInputStream == null)
			return;
		ZipArchiveEntry zipEntry = new ZipArchiveEntry(buildZipEntryPath());
		zipOutput.putArchiveEntry(zipEntry);
		BufferedInputStream bis = null;
		byte[] buffer = new byte[65536];
		try {
			bis = new BufferedInputStream(contentInputStream);
			int l;
			while ((l = bis.read(buffer)) != -1)
				zipOutput.write(buffer, 0, l);
			zipOutput.closeArchiveEntry();
		} finally {
			IOUtils.close(bis);
		}
	}

	/**
	 * Builds the ZIP entry path described in
	 * {@link #writeToZip(ZipArchiveOutputStream)}. A missing URI, host or path
	 * (e.g. opaque or relative URIs) simply contributes no segment.
	 */
	private String buildZipEntryPath() throws IOException {
		String host = uri == null ? null : uri.getHost();
		String uriPath = uri == null ? null : uri.getPath();
		StringBuilder path = new StringBuilder();
		String[] domainParts = splitOrEmpty(host, '.');
		for (int i = domainParts.length - 1; i >= 0; i--) {
			path.append(domainParts[i]);
			path.append('/');
		}
		String[] pathParts = splitOrEmpty(uriPath, '/');
		// The last segment is handled separately below.
		for (int i = 0; i < pathParts.length - 1; i++) {
			if (StringUtils.isEmpty(pathParts[i]))
				continue;
			path.append(pathParts[i]);
			path.append('/');
		}
		// NOTE(review): the filename and the path segments are written as is;
		// if they can come from a remote server, consider rejecting ".."
		// segments and separators before building the entry name.
		if (contentDispositionFilename != null)
			path.append(contentDispositionFilename);
		else {
			String lastPart = pathParts.length == 0 ? null
					: pathParts[pathParts.length - 1];
			if (StringUtils.isEmpty(lastPart))
				path.append("index");
			else
				path.append(lastPart);
		}
		if (uriPath != null && uriPath.endsWith("/"))
			path.append("/_index");
		String query = uri == null ? null : uri.getQuery();
		String fragment = uri == null ? null : uri.getFragment();
		if (!StringUtils.isEmpty(query) || !StringUtils.isEmpty(fragment)) {
			// A fixed charset keeps the suffix identical on every platform.
			CRC32 crc32 = new CRC32();
			if (!StringUtils.isEmpty(query))
				crc32.update(query.getBytes("UTF-8"));
			if (!StringUtils.isEmpty(fragment))
				crc32.update(fragment.getBytes("UTF-8"));
			path.append('.');
			path.append(crc32.getValue());
		}
		return path.toString();
	}

	/**
	 * Splits the text on the separator, returning an empty array instead of
	 * null when there is nothing to split.
	 */
	private static String[] splitOrEmpty(String text, char separator) {
		String[] parts = text == null ? null : StringUtils.split(text,
				separator);
		return parts == null ? new String[0] : parts;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy