
com.jaeksoft.searchlib.crawler.web.database.UrlItem Maven / Gradle / Ivy
Show all versions of opensearchserver Show documentation
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2008-2014 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see .
**/
package com.jaeksoft.searchlib.crawler.web.database;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.crawler.TargetStatus;
import com.jaeksoft.searchlib.crawler.common.database.FetchStatus;
import com.jaeksoft.searchlib.crawler.common.database.IndexStatus;
import com.jaeksoft.searchlib.crawler.common.database.ParserStatus;
import com.jaeksoft.searchlib.index.FieldContent;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.result.ResultDocument;
import com.jaeksoft.searchlib.schema.FieldValueItem;
import com.jaeksoft.searchlib.util.FormatUtils.ThreadSafeDecimalFormat;
import com.jaeksoft.searchlib.util.FormatUtils.ThreadSafeSimpleDateFormat;
import com.jaeksoft.searchlib.util.LinkUtils;
import java.io.IOException;
import java.net.URL;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
public class UrlItem {
private String urlString;
private URL url;
private String contentDispositionFilename;
private String contentBaseType;
private String contentTypeCharset;
private Long contentLength;
private String contentEncoding;
private String lang;
private String langMethod;
private String host;
private List subhost;
private Date when;
private RobotsTxtStatus robotsTxtStatus;
private FetchStatus fetchStatus;
private Integer responseCode;
private ParserStatus parserStatus;
private IndexStatus indexStatus;
private int count;
private String md5size;
private Date lastModifiedDate;
private Date contentUpdateDate;
private List outLinks;
private List inLinks;
private String parentUrl;
private String redirectionUrl;
private LinkItem.Origin origin;
private List headers;
private int backlinkCount;
private int depth;
private String urlWhen;
protected UrlItem() {
urlString = null;
url = null;
contentDispositionFilename = null;
contentBaseType = null;
contentTypeCharset = null;
contentLength = null;
contentEncoding = null;
lang = null;
langMethod = null;
host = null;
subhost = null;
outLinks = null;
inLinks = null;
when = new Date();
robotsTxtStatus = RobotsTxtStatus.UNKNOWN;
fetchStatus = FetchStatus.UN_FETCHED;
responseCode = null;
parserStatus = ParserStatus.NOT_PARSED;
indexStatus = IndexStatus.NOT_INDEXED;
count = 0;
md5size = null;
lastModifiedDate = null;
contentUpdateDate = null;
parentUrl = null;
redirectionUrl = null;
origin = null;
headers = null;
backlinkCount = 0;
depth = 0;
urlWhen = null;
}
protected void init(ResultDocument doc) {
setUrl(doc.getValueContent(UrlItemFieldEnum.INSTANCE.url.getName(), 0));
setHost(doc.getValueContent(UrlItemFieldEnum.INSTANCE.host.getName(), 0));
setSubHost(doc.getValues(UrlItemFieldEnum.INSTANCE.subhost.getName()));
addOutLinks(doc.getValues(UrlItemFieldEnum.INSTANCE.outlink.getName()));
addInLinks(doc.getValues(UrlItemFieldEnum.INSTANCE.inlink.getName()));
setContentDispositionFilename(
doc.getValueContent(UrlItemFieldEnum.INSTANCE.contentDispositionFilename.getName(), 0));
setContentBaseType(doc.getValueContent(UrlItemFieldEnum.INSTANCE.contentBaseType.getName(), 0));
setContentTypeCharset(doc.getValueContent(UrlItemFieldEnum.INSTANCE.contentTypeCharset.getName(), 0));
setContentLength(doc.getValueContent(UrlItemFieldEnum.INSTANCE.contentLength.getName(), 0));
setContentEncoding(doc.getValueContent(UrlItemFieldEnum.INSTANCE.contentEncoding.getName(), 0));
setLang(doc.getValueContent(UrlItemFieldEnum.INSTANCE.lang.getName(), 0));
setLangMethod(doc.getValueContent(UrlItemFieldEnum.INSTANCE.langMethod.getName(), 0));
setWhen(doc.getValueContent(UrlItemFieldEnum.INSTANCE.when.getName(), 0));
setRobotsTxtStatusInt(doc.getValueContent(UrlItemFieldEnum.INSTANCE.robotsTxtStatus.getName(), 0));
setFetchStatusInt(doc.getValueContent(UrlItemFieldEnum.INSTANCE.fetchStatus.getName(), 0));
setResponseCode(doc.getValueContent(UrlItemFieldEnum.INSTANCE.responseCode.getName(), 0));
setParserStatusInt(doc.getValueContent(UrlItemFieldEnum.INSTANCE.parserStatus.getName(), 0));
setIndexStatusInt(doc.getValueContent(UrlItemFieldEnum.INSTANCE.indexStatus.getName(), 0));
setMd5size(doc.getValueContent(UrlItemFieldEnum.INSTANCE.md5size.getName(), 0));
setLastModifiedDate(doc.getValueContent(UrlItemFieldEnum.INSTANCE.lastModifiedDate.getName(), 0));
setContentUpdateDate(doc.getValueContent(UrlItemFieldEnum.INSTANCE.contentUpdateDate.getName(), 0));
setParentUrl(doc.getValueContent(UrlItemFieldEnum.INSTANCE.parentUrl.getName(), 0));
setRedirectionUrl(doc.getValueContent(UrlItemFieldEnum.INSTANCE.redirectionUrl.getName(), 0));
setOrigin(LinkItem.findOrigin(doc.getValueContent(UrlItemFieldEnum.INSTANCE.origin.getName(), 0)));
addHeaders(doc.getValues(UrlItemFieldEnum.INSTANCE.headers.getName()));
setBacklinkCount(doc.getValueContent(UrlItemFieldEnum.INSTANCE.backlinkCount.getName(), 0));
setDepth(doc.getValueContent(UrlItemFieldEnum.INSTANCE.depth.getName(), 0));
urlWhen = doc.getValueContent(UrlItemFieldEnum.INSTANCE.urlWhen.getName(), 0);
}
private void addHeaders(List headersList) {
if (headersList == null)
return;
if (headers == null)
headers = new ArrayList();
for (FieldValueItem item : headersList)
headers.add(item.getValue());
}
public List getSubHost() {
return subhost;
}
public List getOutLinks() {
return outLinks;
}
public List getInLinks() {
return inLinks;
}
public void setSubHost(List subhostlist) {
this.subhost = null;
if (subhostlist == null)
return;
this.subhost = new ArrayList();
for (FieldValueItem item : subhostlist)
this.subhost.add(item.getValue());
}
public void clearOutLinks() {
if (outLinks == null)
return;
outLinks.clear();
}
public void addOutLinks(List linkList) {
if (linkList == null)
return;
if (outLinks == null)
outLinks = new ArrayList();
for (FieldValueItem item : linkList)
outLinks.add(item.getValue());
}
public void addOutLinks(FieldContent fieldContent) {
if (fieldContent == null)
return;
addOutLinks(fieldContent.getValues());
}
public void clearInLinks() {
if (inLinks == null)
return;
inLinks.clear();
}
public void addInLinks(List linkList) {
if (linkList == null)
return;
if (inLinks == null)
inLinks = new ArrayList();
for (FieldValueItem item : linkList)
inLinks.add(item.getValue());
}
public void addInLinks(FieldContent fieldContent) {
if (fieldContent == null)
return;
addInLinks(fieldContent.getValues());
}
public String getHost() {
return host;
}
public void setHost(String host) {
this.host = host;
}
public FetchStatus getFetchStatus() {
if (fetchStatus == null)
return FetchStatus.UN_FETCHED;
return fetchStatus;
}
public void setParserStatus(ParserStatus status) {
this.parserStatus = status;
}
public void setParserStatusInt(int v) {
this.parserStatus = ParserStatus.find(v);
}
private void setParserStatusInt(String v) {
if (v != null)
setParserStatusInt(Integer.parseInt(v));
}
public String getContentTypeCharset() {
return contentTypeCharset;
}
public void setContentTypeCharset(String v) {
contentTypeCharset = v;
}
public String getContentDispositionFilename() {
return contentDispositionFilename;
}
public void setContentDispositionFilename(String v) {
contentDispositionFilename = v;
}
public String getContentBaseType() {
return contentBaseType;
}
public void setContentBaseType(String v) {
contentBaseType = v;
}
public void setContentEncoding(String v) {
contentEncoding = v;
}
public String getContentEncoding() {
return contentEncoding;
}
private void setContentLength(String v) {
if (v == null)
return;
if (v.length() == 0)
return;
try {
contentLength = longFormat.parse(v).longValue();
} catch (ParseException e) {
Logging.error(e.getMessage(), e);
}
}
public void setContentLength(Long v) {
contentLength = v;
}
public Long getContentLength() {
return contentLength;
}
public ParserStatus getParserStatus() {
if (parserStatus == null)
return ParserStatus.NOT_PARSED;
return parserStatus;
}
public void setIndexStatus(IndexStatus status) {
this.indexStatus = status;
}
public void setIndexStatusInt(int v) {
this.indexStatus = IndexStatus.find(v);
}
private void setIndexStatusInt(String v) {
if (v != null)
setIndexStatusInt(Integer.parseInt(v));
}
public IndexStatus getIndexStatus() {
if (indexStatus == null)
return IndexStatus.NOT_INDEXED;
return indexStatus;
}
public RobotsTxtStatus getRobotsTxtStatus() {
if (robotsTxtStatus == null)
return RobotsTxtStatus.UNKNOWN;
return robotsTxtStatus;
}
public void setRobotsTxtStatus(RobotsTxtStatus status) {
this.robotsTxtStatus = status;
}
public void setRobotsTxtStatusInt(int v) {
this.robotsTxtStatus = RobotsTxtStatus.find(v);
}
private void setRobotsTxtStatusInt(String v) {
if (v != null)
setRobotsTxtStatusInt(Integer.parseInt(v));
}
public void setFetchStatus(FetchStatus status) {
this.fetchStatus = status;
}
public void setFetchStatusInt(int v) {
this.fetchStatus = FetchStatus.find(v);
}
private void setFetchStatusInt(String v) {
if (v != null)
setFetchStatusInt(Integer.parseInt(v));
}
private void setResponseCode(String v) {
if (v != null)
responseCode = new Integer(v);
}
public void setResponseCode(Integer v) {
responseCode = v;
}
public Integer getResponseCode() {
return responseCode;
}
public String getUrl() {
return urlString;
}
public URL getURL() {
return url;
}
public void setUrl(String url) {
synchronized (this) {
this.urlString = url;
this.url = LinkUtils.getURL(urlString, false);
checkUrlWhen();
}
}
public String getParentUrl() {
return parentUrl;
}
public void setParentUrl(String parentUrl) {
this.parentUrl = parentUrl;
}
public boolean isRedirection() {
return redirectionUrl != null;
}
public String getRedirectionUrl() {
return redirectionUrl;
}
public void setRedirectionUrl(String redirectionUrl) {
this.redirectionUrl = redirectionUrl;
}
public LinkItem.Origin getOrigin() {
return origin;
}
public void setOrigin(LinkItem.Origin origin) {
this.origin = origin;
}
public Date getWhen() {
return when;
}
public void setWhen(Date d) {
if (d == null) {
setWhenNow();
return;
}
when = d;
}
public Date getLastModifiedDate() {
return lastModifiedDate;
}
protected void setLastModifiedDate(String d) {
try {
this.lastModifiedDate = d == null ? null : whenDateFormat.parse(d);
} catch (ParseException e) {
Logging.error(e.getMessage(), e);
}
}
public void setLastModifiedDate(Date d) {
this.lastModifiedDate = d;
}
public void setLastModifiedDate(Long time) {
this.lastModifiedDate = time == null ? null : new Date(time);
}
public Date getContentUpdateDate() {
return contentUpdateDate;
}
protected void setContentUpdateDate(String d) {
try {
this.contentUpdateDate = d == null ? null : whenDateFormat.parse(d);
} catch (ParseException e) {
Logging.error(e.getMessage(), e);
}
}
public void setContentUpdateDate(Date d) {
this.contentUpdateDate = d;
}
final static ThreadSafeSimpleDateFormat whenDateFormat = new ThreadSafeSimpleDateFormat("yyyyMMddHHmmss");
final static ThreadSafeDecimalFormat longFormat = new ThreadSafeDecimalFormat("00000000000000");
protected void setWhen(String d) {
if (d == null) {
setWhenNow();
return;
}
try {
when = whenDateFormat.parse(d);
checkUrlWhen();
} catch (ParseException e) {
Logging.error(e.getMessage(), e);
setWhenNow();
}
}
public void setWhenNow() {
setWhen(new Date(System.currentTimeMillis()));
checkUrlWhen();
}
public String getCount() {
return Integer.toString(count);
}
public static List buildSubHost(String host) {
if (host == null)
return null;
List subhost = new ArrayList();
int lastPos = host.length();
while (lastPos > 0) {
lastPos = host.lastIndexOf('.', lastPos - 1);
if (lastPos == -1)
break;
subhost.add(host.substring(lastPos + 1));
}
subhost.add(host);
return subhost;
}
public void populate(IndexDocument indexDocument) throws IOException {
indexDocument.setString(UrlItemFieldEnum.INSTANCE.url.getName(), getUrl());
indexDocument.setString(UrlItemFieldEnum.INSTANCE.when.getName(), whenDateFormat.format(when));
if (url != null) {
indexDocument.setString(UrlItemFieldEnum.INSTANCE.host.getName(), url.getHost());
indexDocument.setStringList(UrlItemFieldEnum.INSTANCE.subhost.getName(), buildSubHost(url.getHost()));
}
if (inLinks != null)
indexDocument.setStringList(UrlItemFieldEnum.INSTANCE.inlink.getName(), inLinks);
if (outLinks != null)
indexDocument.setStringList(UrlItemFieldEnum.INSTANCE.outlink.getName(), outLinks);
if (responseCode != null)
indexDocument.setObject(UrlItemFieldEnum.INSTANCE.responseCode.getName(), responseCode);
if (contentDispositionFilename != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.contentDispositionFilename.getName(),
contentDispositionFilename);
if (contentBaseType != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.contentBaseType.getName(), contentBaseType);
if (contentTypeCharset != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.contentTypeCharset.getName(), contentTypeCharset);
if (contentLength != null)
indexDocument
.setString(UrlItemFieldEnum.INSTANCE.contentLength.getName(), longFormat.format(contentLength));
if (contentEncoding != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.contentEncoding.getName(), contentEncoding);
if (lang != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.lang.getName(), lang);
if (langMethod != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.langMethod.getName(), langMethod);
indexDocument.setObject(UrlItemFieldEnum.INSTANCE.robotsTxtStatus.getName(), robotsTxtStatus.value);
indexDocument.setObject(UrlItemFieldEnum.INSTANCE.fetchStatus.getName(), fetchStatus.value);
indexDocument.setObject(UrlItemFieldEnum.INSTANCE.parserStatus.getName(), parserStatus.value);
indexDocument.setObject(UrlItemFieldEnum.INSTANCE.indexStatus.getName(), indexStatus.value);
if (md5size != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.md5size.getName(), md5size);
if (lastModifiedDate != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.lastModifiedDate.getName(),
whenDateFormat.format(lastModifiedDate));
if (contentUpdateDate != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.contentUpdateDate.getName(),
whenDateFormat.format(contentUpdateDate));
if (parentUrl != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.parentUrl.getName(), parentUrl);
if (redirectionUrl != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.redirectionUrl.getName(), redirectionUrl);
if (origin != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.origin.getName(), origin.name());
if (headers != null)
indexDocument.setStringList(UrlItemFieldEnum.INSTANCE.headers.getName(), headers);
indexDocument.setString(UrlItemFieldEnum.INSTANCE.backlinkCount.getName(), longFormat.format(backlinkCount));
indexDocument.setString(UrlItemFieldEnum.INSTANCE.depth.getName(), longFormat.format(depth));
indexDocument.setString(UrlItemFieldEnum.INSTANCE.urlWhen.getName(), urlWhen);
}
public String getLang() {
return lang;
}
public void setLang(String lang) {
this.lang = lang;
}
public String getLangMethod() {
return langMethod;
}
public String getFullLang() {
StringBuilder sb = new StringBuilder();
if (lang != null)
sb.append(lang);
if (langMethod != null) {
sb.append('(');
sb.append(langMethod);
sb.append(')');
}
return sb.toString();
}
public void setLangMethod(String langMethod) {
this.langMethod = langMethod;
}
public TargetStatus getTargetResult() {
if (robotsTxtStatus.targetStatus != TargetStatus.TARGET_UPDATE)
return robotsTxtStatus.targetStatus;
if (fetchStatus.targetStatus != TargetStatus.TARGET_UPDATE)
return fetchStatus.targetStatus;
if (parserStatus.targetStatus != TargetStatus.TARGET_UPDATE)
return parserStatus.targetStatus;
return indexStatus.targetStatus;
}
public String getMd5size() {
return md5size;
}
public void setMd5size(String md5size) {
this.md5size = md5size;
}
public List getHeaders() {
return headers;
}
public void setHeaders(List headers) {
this.headers = headers;
}
/**
* @return the backLinkCount
*/
public int getBacklinkCount() {
return backlinkCount;
}
/**
* @param v the backLinkCount to set
*/
private void setBacklinkCount(String v) {
if (v == null)
return;
if (v.length() == 0)
return;
try {
backlinkCount = longFormat.parse(v).intValue();
} catch (ParseException e) {
Logging.error(e.getMessage(), e);
}
}
/**
* @param backLinkCount the backLinkCount to set
*/
public void setBacklinkCount(int backLinkCount) {
this.backlinkCount = backLinkCount;
}
/**
* @return the depth
*/
public int getDepth() {
return depth;
}
/**
* @param depth the depth to set
*/
public void setDepth(int depth) {
this.depth = depth;
}
/**
* @param v the depth to set
*/
private void setDepth(String v) {
if (v == null)
return;
if (v.length() == 0)
return;
try {
depth = longFormat.parse(v).intValue();
} catch (ParseException e) {
Logging.error(e.getMessage(), e);
}
}
public void checkUrlWhen() {
StringBuilder sb = new StringBuilder();
if (when != null)
sb.append(whenDateFormat.format(when));
if (urlString != null)
sb.append(urlString);
urlWhen = sb.length() == 0 ? null : sb.toString();
}
}