com.jaeksoft.searchlib.crawler.web.spider.DownloadItem Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class, search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTFul API you will be able to integrate quickly and easily advanced full-text search capabilities in your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.
The newest version!
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2012-2014 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler.web.spider;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.CRC32;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
import org.apache.commons.io.FilenameUtils;
import org.apache.http.Header;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import com.jaeksoft.searchlib.SearchLibException.WrongStatusCodeException;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.StringUtils;
/**
 * Holds the metadata (status, content headers, redirect target, ...) and the
 * content stream of one downloaded URI.
 * <p>
 * The metadata can be exported with {@link #getMetaAsJson()} and restored with
 * {@link #loadMetaFromJson(JSONObject)}; the content stream itself is never
 * part of the JSON. The class performs no synchronization.
 */
public class DownloadItem {

    /*
     * Keys of the JSON document produced by getMetaAsJson() and consumed by
     * loadMetaFromJson(). The values are part of the stored format.
     */
    protected static final String KEY_REDIRECT_LOCATION = "KEY_REDIRECT_LOCATION";
    protected static final String KEY_CONTENT_DISPOSITION_FILENAME = "KEY_CONTENT_DISPOSITION_FILENAME";
    protected static final String KEY_CONTENT_LENGTH = "KEY_CONTENT_LENGTH";
    protected static final String KEY_LAST_MODIFIED = "KEY_LAST_MODIFIED";
    protected static final String KEY_CONTENT_BASE_TYPE = "KEY_CONTENT_BASE_TYPE";
    protected static final String KEY_CONTENT_TYPE_CHARSET = "KEY_CONTENT_TYPE_CHARSET";
    protected static final String KEY_CONTENT_ENCODING = "KEY_CONTENT_ENCODING";
    protected static final String KEY_CONTENT_LOCATION = "KEY_CONTENT_LOCATION";
    protected static final String KEY_STATUS_CODE = "KEY_STATUS_CODE";
    protected static final String KEY_REASON_PHRASE = "KEY_REASON_PHRASE";
    protected static final String KEY_HEADERS = "KEY_HEADERS";

    /**
     * Charset used to turn the query/fragment into bytes before hashing them
     * for the ZIP entry name, so the name does not depend on the platform
     * default charset.
     */
    private static final String ENTRY_NAME_CHARSET = "UTF-8";

    private final URI uri;
    private URI redirectLocation = null;
    private Long contentLength = null;
    private String contentDispositionFilename = null;
    private String contentBaseType = null;
    private String contentTypeCharset = null;
    private String contentEncoding = null;
    private String contentLocation = null;
    private Long lastModified = null;
    private Integer statusCode = null;
    private String reasonPhrase = null;
    private InputStream contentInputStream = null;
    private boolean fromCache = false;
    /** Headers formatted as "name: value"; this is what goes into the JSON. */
    private List<String> headers = null;
    /** Raw headers as passed to {@link #setHeaders(Header[])}; not serialized. */
    private Header[] httpHeaders = null;

    /**
     * @param uri
     *            the URI this item was (or will be) downloaded from
     */
    public DownloadItem(URI uri) {
        this.uri = uri;
    }

    /**
     * Serializes the metadata to a JSON object string. Only the properties
     * that are currently non-null are written.
     * 
     * @return the JSON text
     * @throws JSONException
     *             if a value cannot be stored in the JSON object
     */
    public String getMetaAsJson() throws JSONException {
        JSONObject json = new JSONObject();
        if (redirectLocation != null)
            json.put(KEY_REDIRECT_LOCATION, redirectLocation.toASCIIString());
        if (contentLength != null)
            json.put(KEY_CONTENT_LENGTH, contentLength);
        if (lastModified != null)
            json.put(KEY_LAST_MODIFIED, lastModified);
        if (contentDispositionFilename != null)
            json.put(KEY_CONTENT_DISPOSITION_FILENAME,
                    contentDispositionFilename);
        if (contentBaseType != null)
            json.put(KEY_CONTENT_BASE_TYPE, contentBaseType);
        if (contentTypeCharset != null)
            json.put(KEY_CONTENT_TYPE_CHARSET, contentTypeCharset);
        if (contentEncoding != null)
            json.put(KEY_CONTENT_ENCODING, contentEncoding);
        if (contentLocation != null)
            json.put(KEY_CONTENT_LOCATION, contentLocation);
        if (statusCode != null)
            json.put(KEY_STATUS_CODE, statusCode);
        if (reasonPhrase != null)
            json.put(KEY_REASON_PHRASE, reasonPhrase);
        if (headers != null)
            json.put(KEY_HEADERS, headers);
        return json.toString();
    }

    /**
     * Restores the metadata from a JSON object using the same keys as
     * {@link #getMetaAsJson()} and flags this item as coming from the cache.
     * Properties whose key is absent keep their current value. Only the
     * formatted header list is restored; the raw header array used by
     * {@link #getFirstHttpHeader(String)} is not touched.
     * 
     * @param json
     *            the JSON object to read
     * @throws URISyntaxException
     *             if the stored redirect location is not a valid URI
     * @throws JSONException
     *             if a stored value does not have the expected type
     */
    public void loadMetaFromJson(org.json.JSONObject json)
            throws URISyntaxException, JSONException {
        fromCache = true;
        if (json.has(KEY_REDIRECT_LOCATION)) {
            String s = json.getString(KEY_REDIRECT_LOCATION);
            if (s != null)
                redirectLocation = new URI(s);
        }
        if (json.has(KEY_CONTENT_LENGTH))
            contentLength = json.getLong(KEY_CONTENT_LENGTH);
        if (json.has(KEY_LAST_MODIFIED))
            lastModified = json.getLong(KEY_LAST_MODIFIED);
        if (json.has(KEY_CONTENT_DISPOSITION_FILENAME))
            contentDispositionFilename = json
                    .getString(KEY_CONTENT_DISPOSITION_FILENAME);
        if (json.has(KEY_CONTENT_BASE_TYPE))
            contentBaseType = json.getString(KEY_CONTENT_BASE_TYPE);
        if (json.has(KEY_CONTENT_TYPE_CHARSET))
            contentTypeCharset = json.getString(KEY_CONTENT_TYPE_CHARSET);
        if (json.has(KEY_CONTENT_ENCODING))
            contentEncoding = json.getString(KEY_CONTENT_ENCODING);
        if (json.has(KEY_CONTENT_LOCATION))
            contentLocation = json.getString(KEY_CONTENT_LOCATION);
        if (json.has(KEY_STATUS_CODE))
            statusCode = json.getInt(KEY_STATUS_CODE);
        if (json.has(KEY_REASON_PHRASE))
            reasonPhrase = json.getString(KEY_REASON_PHRASE);
        if (json.has(KEY_HEADERS)) {
            headers = new ArrayList<String>();
            JSONArray headerJsonArray = json.getJSONArray(KEY_HEADERS);
            if (headerJsonArray != null)
                for (int i = 0; i < headerJsonArray.length(); i++)
                    headers.add(headerJsonArray.get(i).toString());
        }
    }

    /**
     * @return the redirectLocation
     */
    public URI getRedirectLocation() {
        return redirectLocation;
    }

    /**
     * @param redirectLocation
     *            the redirectLocation to set
     */
    public void setRedirectLocation(URI redirectLocation) {
        this.redirectLocation = redirectLocation;
    }

    /**
     * @return the contentLength
     */
    public Long getContentLength() {
        return contentLength;
    }

    /**
     * @return the lastModified
     */
    public Long getLastModified() {
        return lastModified;
    }

    /**
     * @param lastModified
     *            the lastModified to set
     */
    public void setLastModified(Long lastModified) {
        this.lastModified = lastModified;
    }

    /**
     * @param contentLength
     *            the contentLength to set
     */
    public void setContentLength(Long contentLength) {
        this.contentLength = contentLength;
    }

    /**
     * @return the contentDispositionFilename
     */
    public String getContentDispositionFilename() {
        return contentDispositionFilename;
    }

    /**
     * @param contentDispositionFilename
     *            the contentDispositionFilename to set
     */
    public void setContentDispositionFilename(String contentDispositionFilename) {
        this.contentDispositionFilename = contentDispositionFilename;
    }

    /**
     * Returns a file name for this item: the content-disposition file name
     * when one is set, otherwise the last segment of the URI path.
     * 
     * @return the file name, or null if there is neither a content-disposition
     *         file name nor a URI
     * @throws MalformedURLException
     *             if the URI cannot be converted to a URL
     */
    public String getFileName() throws MalformedURLException {
        if (contentDispositionFilename != null)
            return contentDispositionFilename;
        if (uri == null)
            return null;
        String urlFile = uri.toURL().getPath();
        if (urlFile == null)
            return null;
        return FilenameUtils.getName(urlFile);
    }

    /**
     * @return the contentBaseType
     */
    public String getContentBaseType() {
        return contentBaseType;
    }

    /**
     * @param contentBaseType
     *            the contentBaseType to set
     */
    public void setContentBaseType(String contentBaseType) {
        this.contentBaseType = contentBaseType;
    }

    /**
     * @return the contentTypeCharset
     */
    public String getContentTypeCharset() {
        return contentTypeCharset;
    }

    /**
     * @param contentTypeCharset
     *            the contentTypeCharset to set
     */
    public void setContentTypeCharset(String contentTypeCharset) {
        this.contentTypeCharset = contentTypeCharset;
    }

    /**
     * @return the contentEncoding
     */
    public String getContentEncoding() {
        return contentEncoding;
    }

    /**
     * @param contentEncoding
     *            the contentEncoding to set
     */
    public void setContentEncoding(String contentEncoding) {
        this.contentEncoding = contentEncoding;
    }

    /**
     * @return the statusCode
     */
    public Integer getStatusCode() {
        return statusCode;
    }

    /**
     * Checks that the status code lies in the given range.
     * 
     * @param fromInclusive
     *            lowest accepted status code
     * @param toExclusive
     *            first status code that is no longer accepted
     * @throws WrongStatusCodeException
     *             if no status code is set or if it is outside the range
     */
    public void checkNoErrorRange(int fromInclusive, int toExclusive)
            throws WrongStatusCodeException {
        if (statusCode == null)
            throw new WrongStatusCodeException("No status code - ", uri);
        if (statusCode < fromInclusive || statusCode >= toExclusive)
            throw new WrongStatusCodeException("Wrong status code: ",
                    statusCode, ' ', reasonPhrase, " - ", uri);
    }

    /**
     * Checks that the status code is one of the given codes.
     * 
     * @param validCodes
     *            the accepted status codes
     * @throws WrongStatusCodeException
     *             if no status code is set or if it matches none of the given
     *             codes
     */
    public void checkNoErrorList(int... validCodes)
            throws WrongStatusCodeException {
        // Same message as checkNoErrorRange: a missing code is not a "wrong"
        // code and should not be reported as "null null".
        if (statusCode == null)
            throw new WrongStatusCodeException("No status code - ", uri);
        int code = statusCode.intValue();
        for (int validCode : validCodes)
            if (code == validCode)
                return;
        throw new WrongStatusCodeException("Wrong status code: ", statusCode,
                ' ', reasonPhrase, " - ", uri);
    }

    /**
     * @param statusCode
     *            the statusCode to set
     */
    public void setStatusCode(Integer statusCode) {
        this.statusCode = statusCode;
    }

    /**
     * @return the reasonPhrase
     */
    public String getReasonPhrase() {
        return reasonPhrase;
    }

    /**
     * @param reasonPhrase
     *            the reasonPhrase to set
     */
    public void setReasonPhrase(String reasonPhrase) {
        this.reasonPhrase = reasonPhrase;
    }

    /**
     * @return the contentInputStream
     */
    public InputStream getContentInputStream() {
        return contentInputStream;
    }

    /**
     * @param contentInputStream
     *            the inputStream to set
     */
    public void setContentInputStream(InputStream contentInputStream) {
        this.contentInputStream = contentInputStream;
    }

    /**
     * @return the uri
     */
    public URI getUri() {
        return uri;
    }

    /**
     * @return the fromCache flag; it is set by
     *         {@link #loadMetaFromJson(JSONObject)}
     */
    public boolean isFromCache() {
        return fromCache;
    }

    /**
     * @return the headers formatted as "name: value" strings, or null if none
     *         were set or loaded. The internal list is returned, not a copy.
     */
    public List<String> getHeaders() {
        return headers;
    }

    /**
     * Stores the raw headers and rebuilds the formatted "name: value" list
     * from them.
     * <p>
     * When {@code headers} is null only the raw header array is reset; a
     * formatted list built or loaded earlier is left as it is.
     * 
     * @param headers
     *            the HTTP headers, may be null
     */
    public void setHeaders(Header[] headers) {
        httpHeaders = headers;
        if (headers == null)
            return;
        this.headers = new ArrayList<String>(headers.length);
        for (Header header : headers) {
            StringBuilder sb = new StringBuilder();
            sb.append(header.getName());
            sb.append(": ");
            sb.append(header.getValue());
            this.headers.add(sb.toString());
        }
    }

    /**
     * Looks up a header by name, ignoring case, among the raw headers given to
     * {@link #setHeaders(Header[])}. Headers restored by
     * {@link #loadMetaFromJson(JSONObject)} are not searched.
     * 
     * @param name
     *            the header name
     * @return the value of the first matching header, or null
     */
    public String getFirstHttpHeader(String name) {
        if (httpHeaders == null)
            return null;
        for (Header header : httpHeaders)
            if (header.getName().equalsIgnoreCase(name))
                return header.getValue();
        return null;
    }

    /**
     * Converts the content stream to a string with {@code IOUtils.toString}.
     * The stream is handed over as is, so it should be treated as consumed
     * afterwards.
     * 
     * @return the content, or null if there is no content stream
     * @throws IOException
     *             if reading fails
     */
    public String getContentAsString() throws IOException {
        if (contentInputStream == null)
            return null;
        return IOUtils.toString(contentInputStream);
    }

    /**
     * @return the contentLocation
     */
    public String getContentLocation() {
        return contentLocation;
    }

    /**
     * @param contentLocation
     *            the contentLocation to set
     */
    public void setContentLocation(String contentLocation) {
        this.contentLocation = contentLocation;
    }

    /**
     * Copies the content stream into the given file. Does nothing when there
     * is no content stream. The output streams opened here are closed before
     * returning; this method does not itself close the content stream.
     * 
     * @param file
     *            the destination file
     * @throws IOException
     *             if the file cannot be opened or the copy fails
     */
    public void writeToFile(File file) throws IOException {
        if (contentInputStream == null)
            return;
        FileOutputStream fos = null;
        BufferedOutputStream bos = null;
        try {
            fos = new FileOutputStream(file);
            bos = new BufferedOutputStream(fos);
            IOUtils.copy(contentInputStream, bos);
        } finally {
            IOUtils.close(bos, fos);
        }
    }

    /**
     * Adds the content as a new entry of the given ZIP archive. Does nothing
     * when there is no content stream. The entry name is derived from the URI
     * (see {@link #buildZipEntryPath()}).
     * <p>
     * The buffered wrapper around the content stream is passed to
     * {@code IOUtils.close} when done, so the content stream should be
     * treated as closed afterwards. The archive stream is left open.
     * 
     * @param zipOutput
     *            the archive to write to
     * @throws IOException
     *             if reading the content or writing the archive fails
     */
    public void writeToZip(ZipArchiveOutputStream zipOutput) throws IOException {
        if (contentInputStream == null)
            return;
        ZipArchiveEntry zipEntry = new ZipArchiveEntry(buildZipEntryPath());
        zipOutput.putArchiveEntry(zipEntry);
        BufferedInputStream bis = null;
        byte[] buffer = new byte[65536];
        try {
            bis = new BufferedInputStream(contentInputStream);
            int l;
            while ((l = bis.read(buffer)) != -1)
                zipOutput.write(buffer, 0, l);
            zipOutput.closeArchiveEntry();
        } finally {
            IOUtils.close(bis);
        }
    }

    /**
     * Builds the ZIP entry name for this item:
     * <ol>
     * <li>the host labels in reverse order, one directory each
     * (www.example.com becomes com/example/www/);</li>
     * <li>every path segment but the last, one directory each;</li>
     * <li>the content-disposition file name if set, else the last path
     * segment, else "index";</li>
     * <li>"/_index" if the URI path ends with a slash;</li>
     * <li>a dot and the CRC32 of query + fragment if either is present, so
     * that URIs differing only by query get distinct names.</li>
     * </ol>
     * A URI without host or without path (both are null for some URIs) simply
     * contributes no directories.
     * 
     * @return the entry name
     * @throws IOException
     *             if the charset used for hashing is not supported
     */
    private String buildZipEntryPath() throws IOException {
        StringBuilder path = new StringBuilder();

        // URI.getHost() is null for host-less URIs; skip the domain part then.
        String host = uri.getHost();
        String[] domainParts = host == null ? null : StringUtils.split(host,
                '.');
        if (domainParts != null) {
            for (int i = domainParts.length - 1; i >= 0; i--) {
                path.append(domainParts[i]);
                path.append('/');
            }
        }

        // URI.getPath() is null for opaque URIs.
        String uriPath = uri.getPath();
        String[] pathParts = uriPath == null ? null : StringUtils.split(
                uriPath, '/');
        int partCount = pathParts == null ? 0 : pathParts.length;
        for (int i = 0; i < partCount - 1; i++) {
            if (StringUtils.isEmpty(pathParts[i]))
                continue;
            path.append(pathParts[i]);
            path.append('/');
        }

        if (contentDispositionFilename != null)
            path.append(contentDispositionFilename);
        else {
            String lastPart = partCount == 0 ? null : pathParts[partCount - 1];
            if (StringUtils.isEmpty(lastPart))
                path.append("index");
            else
                path.append(lastPart);
        }
        if (uriPath != null && uriPath.endsWith("/"))
            path.append("/_index");

        String query = uri.getQuery();
        String fragment = uri.getFragment();
        boolean hasQuery = !StringUtils.isEmpty(query);
        boolean hasFragment = !StringUtils.isEmpty(fragment);
        if (hasQuery || hasFragment) {
            CRC32 crc32 = new CRC32();
            // Explicit charset: the hash must not vary with the platform
            // default encoding.
            if (hasQuery)
                crc32.update(query.getBytes(ENTRY_NAME_CHARSET));
            if (hasFragment)
                crc32.update(fragment.getBytes(ENTRY_NAME_CHARSET));
            path.append('.');
            path.append(crc32.getValue());
        }
        return path.toString();
    }
}