
com.jaeksoft.searchlib.crawler.web.database.UrlItem Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class, search engine program. Using the web user interface,
the crawlers (web, file, database, ...) and the REST/RESTFul API you will be able to integrate quickly and
easily advanced full-text search capabilities in your application. OpenSearchServer runs on Windows and
Linux/Unix/BSD.
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2008-2014 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see .
**/
package com.jaeksoft.searchlib.crawler.web.database;
import java.io.IOException;
import java.net.URL;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.crawler.TargetStatus;
import com.jaeksoft.searchlib.crawler.common.database.FetchStatus;
import com.jaeksoft.searchlib.crawler.common.database.IndexStatus;
import com.jaeksoft.searchlib.crawler.common.database.ParserStatus;
import com.jaeksoft.searchlib.index.FieldContent;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.result.ResultDocument;
import com.jaeksoft.searchlib.schema.FieldValueItem;
import com.jaeksoft.searchlib.util.FormatUtils.ThreadSafeDecimalFormat;
import com.jaeksoft.searchlib.util.FormatUtils.ThreadSafeSimpleDateFormat;
import com.jaeksoft.searchlib.util.LinkUtils;
public class UrlItem {
private String urlString;
private URL url;
private String contentDispositionFilename;
private String contentBaseType;
private String contentTypeCharset;
private Long contentLength;
private String contentEncoding;
private String lang;
private String langMethod;
private String host;
private List subhost;
private Date when;
private RobotsTxtStatus robotsTxtStatus;
private FetchStatus fetchStatus;
private Integer responseCode;
private ParserStatus parserStatus;
private IndexStatus indexStatus;
private int count;
private String md5size;
private Date lastModifiedDate;
private Date contentUpdateDate;
private List outLinks;
private List inLinks;
private String parentUrl;
private String redirectionUrl;
private LinkItem.Origin origin;
private List headers;
private int backlinkCount;
private String urlWhen;
protected UrlItem() {
urlString = null;
url = null;
contentDispositionFilename = null;
contentBaseType = null;
contentTypeCharset = null;
contentLength = null;
contentEncoding = null;
lang = null;
langMethod = null;
host = null;
subhost = null;
outLinks = null;
inLinks = null;
when = new Date();
robotsTxtStatus = RobotsTxtStatus.UNKNOWN;
fetchStatus = FetchStatus.UN_FETCHED;
responseCode = null;
parserStatus = ParserStatus.NOT_PARSED;
indexStatus = IndexStatus.NOT_INDEXED;
count = 0;
md5size = null;
lastModifiedDate = null;
contentUpdateDate = null;
parentUrl = null;
redirectionUrl = null;
origin = null;
headers = null;
backlinkCount = 0;
urlWhen = null;
}
protected void init(ResultDocument doc) {
setUrl(doc.getValueContent(UrlItemFieldEnum.INSTANCE.url.getName(), 0));
setHost(doc
.getValueContent(UrlItemFieldEnum.INSTANCE.host.getName(), 0));
setSubHost(doc.getValues(UrlItemFieldEnum.INSTANCE.subhost.getName()));
addOutLinks(doc.getValues(UrlItemFieldEnum.INSTANCE.outlink.getName()));
addInLinks(doc.getValues(UrlItemFieldEnum.INSTANCE.inlink.getName()));
setContentDispositionFilename(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.contentDispositionFilename.getName(),
0));
setContentBaseType(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.contentBaseType.getName(), 0));
setContentTypeCharset(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.contentTypeCharset.getName(), 0));
setContentLength(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.contentLength.getName(), 0));
setContentEncoding(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.contentEncoding.getName(), 0));
setLang(doc
.getValueContent(UrlItemFieldEnum.INSTANCE.lang.getName(), 0));
setLangMethod(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.langMethod.getName(), 0));
setWhen(doc
.getValueContent(UrlItemFieldEnum.INSTANCE.when.getName(), 0));
setRobotsTxtStatusInt(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.robotsTxtStatus.getName(), 0));
setFetchStatusInt(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.fetchStatus.getName(), 0));
setResponseCode(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.responseCode.getName(), 0));
setParserStatusInt(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.parserStatus.getName(), 0));
setIndexStatusInt(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.indexStatus.getName(), 0));
setMd5size(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.md5size.getName(), 0));
setLastModifiedDate(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.lastModifiedDate.getName(), 0));
setContentUpdateDate(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.contentUpdateDate.getName(), 0));
setParentUrl(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.parentUrl.getName(), 0));
setRedirectionUrl(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.redirectionUrl.getName(), 0));
setOrigin(LinkItem.findOrigin(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.origin.getName(), 0)));
addHeaders(doc.getValues(UrlItemFieldEnum.INSTANCE.headers.getName()));
setBacklinkCount(doc.getValueContent(
UrlItemFieldEnum.INSTANCE.backlinkCount.getName(), 0));
urlWhen = doc.getValueContent(
UrlItemFieldEnum.INSTANCE.urlWhen.getName(), 0);
}
private void addHeaders(List headersList) {
if (headersList == null)
return;
if (headers == null)
headers = new ArrayList();
for (FieldValueItem item : headersList)
headers.add(item.getValue());
}
public List getSubHost() {
return subhost;
}
public List getOutLinks() {
return outLinks;
}
public List getInLinks() {
return inLinks;
}
public void setSubHost(List subhostlist) {
this.subhost = null;
if (subhostlist == null)
return;
this.subhost = new ArrayList();
for (FieldValueItem item : subhostlist)
this.subhost.add(item.getValue());
}
public void clearOutLinks() {
if (outLinks == null)
return;
outLinks.clear();
}
public void addOutLinks(List linkList) {
if (linkList == null)
return;
if (outLinks == null)
outLinks = new ArrayList();
for (FieldValueItem item : linkList)
outLinks.add(item.getValue());
}
public void addOutLinks(FieldContent fieldContent) {
if (fieldContent == null)
return;
addOutLinks(fieldContent.getValues());
}
public void clearInLinks() {
if (inLinks == null)
return;
inLinks.clear();
}
public void addInLinks(List linkList) {
if (linkList == null)
return;
if (inLinks == null)
inLinks = new ArrayList();
for (FieldValueItem item : linkList)
inLinks.add(item.getValue());
}
public void addInLinks(FieldContent fieldContent) {
if (fieldContent == null)
return;
addInLinks(fieldContent.getValues());
}
public String getHost() {
return host;
}
public void setHost(String host) {
this.host = host;
}
public FetchStatus getFetchStatus() {
if (fetchStatus == null)
return FetchStatus.UN_FETCHED;
return fetchStatus;
}
public void setParserStatus(ParserStatus status) {
this.parserStatus = status;
}
public void setParserStatusInt(int v) {
this.parserStatus = ParserStatus.find(v);
}
private void setParserStatusInt(String v) {
if (v != null)
setParserStatusInt(Integer.parseInt(v));
}
public String getContentTypeCharset() {
return contentTypeCharset;
}
public void setContentTypeCharset(String v) {
contentTypeCharset = v;
}
public String getContentDispositionFilename() {
return contentDispositionFilename;
}
public void setContentDispositionFilename(String v) {
contentDispositionFilename = v;
}
public String getContentBaseType() {
return contentBaseType;
}
public void setContentBaseType(String v) {
contentBaseType = v;
}
public void setContentEncoding(String v) {
contentEncoding = v;
}
public String getContentEncoding() {
return contentEncoding;
}
private void setContentLength(String v) {
if (v == null)
return;
if (v.length() == 0)
return;
try {
contentLength = longFormat.parse(v).longValue();
} catch (ParseException e) {
Logging.error(e.getMessage(), e);
}
}
public void setContentLength(Long v) {
contentLength = v;
}
public Long getContentLength() {
return contentLength;
}
public ParserStatus getParserStatus() {
if (parserStatus == null)
return ParserStatus.NOT_PARSED;
return parserStatus;
}
public void setIndexStatus(IndexStatus status) {
this.indexStatus = status;
}
public void setIndexStatusInt(int v) {
this.indexStatus = IndexStatus.find(v);
}
private void setIndexStatusInt(String v) {
if (v != null)
setIndexStatusInt(Integer.parseInt(v));
}
public IndexStatus getIndexStatus() {
if (indexStatus == null)
return IndexStatus.NOT_INDEXED;
return indexStatus;
}
public RobotsTxtStatus getRobotsTxtStatus() {
if (robotsTxtStatus == null)
return RobotsTxtStatus.UNKNOWN;
return robotsTxtStatus;
}
public void setRobotsTxtStatus(RobotsTxtStatus status) {
this.robotsTxtStatus = status;
}
public void setRobotsTxtStatusInt(int v) {
this.robotsTxtStatus = RobotsTxtStatus.find(v);
}
private void setRobotsTxtStatusInt(String v) {
if (v != null)
setRobotsTxtStatusInt(Integer.parseInt(v));
}
public void setFetchStatus(FetchStatus status) {
this.fetchStatus = status;
}
public void setFetchStatusInt(int v) {
this.fetchStatus = FetchStatus.find(v);
}
private void setFetchStatusInt(String v) {
if (v != null)
setFetchStatusInt(Integer.parseInt(v));
}
private void setResponseCode(String v) {
if (v != null)
responseCode = new Integer(v);
}
public void setResponseCode(Integer v) {
responseCode = v;
}
public Integer getResponseCode() {
return responseCode;
}
public String getUrl() {
return urlString;
}
public URL getURL() {
return url;
}
public void setUrl(String url) {
synchronized (this) {
this.urlString = url;
this.url = LinkUtils.getURL(urlString, false);
checkUrlWhen();
}
}
public String getParentUrl() {
return parentUrl;
}
public void setParentUrl(String parentUrl) {
this.parentUrl = parentUrl;
}
public boolean isRedirection() {
return redirectionUrl != null;
}
public String getRedirectionUrl() {
return redirectionUrl;
}
public void setRedirectionUrl(String redirectionUrl) {
this.redirectionUrl = redirectionUrl;
}
public LinkItem.Origin getOrigin() {
return origin;
}
public void setOrigin(LinkItem.Origin origin) {
this.origin = origin;
}
public Date getWhen() {
return when;
}
public void setWhen(Date d) {
if (d == null) {
setWhenNow();
return;
}
when = d;
}
public Date getLastModifiedDate() {
return lastModifiedDate;
}
protected void setLastModifiedDate(String d) {
try {
this.lastModifiedDate = d == null ? null : whenDateFormat.parse(d);
} catch (ParseException e) {
Logging.error(e.getMessage(), e);
}
}
public void setLastModifiedDate(Date d) {
this.lastModifiedDate = d;
}
public void setLastModifiedDate(Long time) {
this.lastModifiedDate = time == null ? null : new Date(time);
}
public Date getContentUpdateDate() {
return contentUpdateDate;
}
protected void setContentUpdateDate(String d) {
try {
this.contentUpdateDate = d == null ? null : whenDateFormat.parse(d);
} catch (ParseException e) {
Logging.error(e.getMessage(), e);
}
}
public void setContentUpdateDate(Date d) {
this.contentUpdateDate = d;
}
final static ThreadSafeSimpleDateFormat whenDateFormat = new ThreadSafeSimpleDateFormat(
"yyyyMMddHHmmss");
final static ThreadSafeDecimalFormat longFormat = new ThreadSafeDecimalFormat(
"00000000000000");
protected void setWhen(String d) {
if (d == null) {
setWhenNow();
return;
}
try {
when = whenDateFormat.parse(d);
checkUrlWhen();
} catch (ParseException e) {
Logging.error(e.getMessage(), e);
setWhenNow();
}
}
public void setWhenNow() {
setWhen(new Date(System.currentTimeMillis()));
checkUrlWhen();
}
public String getCount() {
return Integer.toString(count);
}
public static List buildSubHost(String host) {
if (host == null)
return null;
List subhost = new ArrayList();
int lastPos = host.length();
while (lastPos > 0) {
lastPos = host.lastIndexOf('.', lastPos - 1);
if (lastPos == -1)
break;
subhost.add(host.substring(lastPos + 1));
}
subhost.add(host);
return subhost;
}
public void populate(IndexDocument indexDocument) throws IOException {
indexDocument.setString(UrlItemFieldEnum.INSTANCE.url.getName(),
getUrl());
indexDocument.setString(UrlItemFieldEnum.INSTANCE.when.getName(),
whenDateFormat.format(when));
if (url != null) {
indexDocument.setString(UrlItemFieldEnum.INSTANCE.host.getName(),
url.getHost());
indexDocument.setStringList(
UrlItemFieldEnum.INSTANCE.subhost.getName(),
buildSubHost(url.getHost()));
}
if (inLinks != null)
indexDocument.setStringList(
UrlItemFieldEnum.INSTANCE.inlink.getName(), inLinks);
if (outLinks != null)
indexDocument.setStringList(
UrlItemFieldEnum.INSTANCE.outlink.getName(), outLinks);
if (responseCode != null)
indexDocument.setObject(
UrlItemFieldEnum.INSTANCE.responseCode.getName(),
responseCode);
if (contentDispositionFilename != null)
indexDocument.setString(
UrlItemFieldEnum.INSTANCE.contentDispositionFilename
.getName(), contentDispositionFilename);
if (contentBaseType != null)
indexDocument.setString(
UrlItemFieldEnum.INSTANCE.contentBaseType.getName(),
contentBaseType);
if (contentTypeCharset != null)
indexDocument.setString(
UrlItemFieldEnum.INSTANCE.contentTypeCharset.getName(),
contentTypeCharset);
if (contentLength != null)
indexDocument.setString(
UrlItemFieldEnum.INSTANCE.contentLength.getName(),
longFormat.format(contentLength));
if (contentEncoding != null)
indexDocument.setString(
UrlItemFieldEnum.INSTANCE.contentEncoding.getName(),
contentEncoding);
if (lang != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.lang.getName(),
lang);
if (langMethod != null)
indexDocument.setString(
UrlItemFieldEnum.INSTANCE.langMethod.getName(), langMethod);
indexDocument.setObject(
UrlItemFieldEnum.INSTANCE.robotsTxtStatus.getName(),
robotsTxtStatus.value);
indexDocument.setObject(
UrlItemFieldEnum.INSTANCE.fetchStatus.getName(),
fetchStatus.value);
indexDocument.setObject(
UrlItemFieldEnum.INSTANCE.parserStatus.getName(),
parserStatus.value);
indexDocument.setObject(
UrlItemFieldEnum.INSTANCE.indexStatus.getName(),
indexStatus.value);
if (md5size != null)
indexDocument.setString(
UrlItemFieldEnum.INSTANCE.md5size.getName(), md5size);
if (lastModifiedDate != null)
indexDocument.setString(
UrlItemFieldEnum.INSTANCE.lastModifiedDate.getName(),
whenDateFormat.format(lastModifiedDate));
if (contentUpdateDate != null)
indexDocument.setString(
UrlItemFieldEnum.INSTANCE.contentUpdateDate.getName(),
whenDateFormat.format(contentUpdateDate));
if (parentUrl != null)
indexDocument.setString(
UrlItemFieldEnum.INSTANCE.parentUrl.getName(), parentUrl);
if (redirectionUrl != null)
indexDocument.setString(
UrlItemFieldEnum.INSTANCE.redirectionUrl.getName(),
redirectionUrl);
if (origin != null)
indexDocument.setString(UrlItemFieldEnum.INSTANCE.origin.getName(),
origin.name());
if (headers != null)
indexDocument.setStringList(
UrlItemFieldEnum.INSTANCE.headers.getName(), headers);
indexDocument.setString(
UrlItemFieldEnum.INSTANCE.backlinkCount.getName(),
longFormat.format(backlinkCount));
indexDocument.setString(UrlItemFieldEnum.INSTANCE.urlWhen.getName(),
urlWhen);
}
public String getLang() {
return lang;
}
public void setLang(String lang) {
this.lang = lang;
}
public String getLangMethod() {
return langMethod;
}
public String getFullLang() {
StringBuilder sb = new StringBuilder();
if (lang != null)
sb.append(lang);
if (langMethod != null) {
sb.append('(');
sb.append(langMethod);
sb.append(')');
}
return sb.toString();
}
public void setLangMethod(String langMethod) {
this.langMethod = langMethod;
}
public TargetStatus getTargetResult() {
if (robotsTxtStatus.targetStatus != TargetStatus.TARGET_UPDATE)
return robotsTxtStatus.targetStatus;
if (fetchStatus.targetStatus != TargetStatus.TARGET_UPDATE)
return fetchStatus.targetStatus;
if (parserStatus.targetStatus != TargetStatus.TARGET_UPDATE)
return parserStatus.targetStatus;
return indexStatus.targetStatus;
}
public String getMd5size() {
return md5size;
}
public void setMd5size(String md5size) {
this.md5size = md5size;
}
public List getHeaders() {
return headers;
}
public void setHeaders(List headers) {
this.headers = headers;
}
/**
* @return the backLinkCount
*/
public int getBacklinkCount() {
return backlinkCount;
}
/**
* @param backLinkCount
* the backLinkCount to set
*/
private void setBacklinkCount(String v) {
if (v == null)
return;
if (v.length() == 0)
return;
try {
backlinkCount = longFormat.parse(v).intValue();
} catch (ParseException e) {
Logging.error(e.getMessage(), e);
}
}
/**
* @param backLinkCount
* the backLinkCount to set
*/
public void setBacklinkCount(int backLinkCount) {
this.backlinkCount = backLinkCount;
}
public void checkUrlWhen() {
StringBuilder sb = new StringBuilder();
if (when != null)
sb.append(whenDateFormat.format(when));
if (urlString != null)
sb.append(urlString);
urlWhen = sb.length() == 0 ? null : sb.toString();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy