com.jaeksoft.searchlib.index.IndexDocument Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you will be able to quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.
The newest version!
/**
 * License Agreement for OpenSearchServer
 * 
 * Copyright (C) 2008-2015 Emmanuel Keller / Jaeksoft
 * 

 * http://www.open-search-server.com
 * 

 * This file is part of OpenSearchServer.
 * 

 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 

 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 

 * You should have received a copy of the GNU General Public License
 * along with OpenSearchServer.
 * If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.index;

import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.LanguageEnum;
import com.jaeksoft.searchlib.crawler.web.database.CredentialItem;
import com.jaeksoft.searchlib.crawler.web.spider.DownloadItem;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.logreport.ErrorParserLogger;
import com.jaeksoft.searchlib.parser.Parser;
import com.jaeksoft.searchlib.parser.ParserSelector;
import com.jaeksoft.searchlib.schema.FieldValueItem;
import com.jaeksoft.searchlib.schema.FieldValueOriginEnum;
import com.jaeksoft.searchlib.schema.Schema;
import com.jaeksoft.searchlib.schema.SchemaField;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.XPathParser;
import org.apache.commons.lang3.StringEscapeUtils;
import org.w3c.dom.DOMException;
import org.w3c.dom.Node;

import javax.xml.xpath.XPathExpressionException;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

public class IndexDocument implements Iterable {

	/** Field contents of this document, keyed by field name (held in a TreeMap). */
	private final Map<String, FieldContent> fields;

	/** Language of the document; {@code null} when none has been set. */
	private LanguageEnum lang;

	/**
	 * Creates an empty document with no language.
	 */
	public IndexDocument() {
		fields = new TreeMap<String, FieldContent>();
		this.lang = null;
	}

	/**
	 * Creates a document that has the same language as {@code sourceDocument}
	 * and to which every field entry of the source is added.
	 *
	 * @param sourceDocument
	 *            the document to copy from; must not be {@code null}
	 */
	public IndexDocument(IndexDocument sourceDocument) {
		this(sourceDocument.lang);
		// Typed entries: the key is the field name, the value its content.
		for (Map.Entry<String, FieldContent> entry : sourceDocument.fields.entrySet())
			add(entry.getKey(), entry.getValue());
	}

	/**
	 * Creates an empty document with the given language.
	 *
	 * @param lang
	 *            the language to assign; may be {@code null}
	 */
	public IndexDocument(LanguageEnum lang) {
		this();
		// The no-arg constructor leaves the language unset; assign it here.
		this.lang = lang;
	}

	/**
	 * Creates an empty document whose language is looked up from the language
	 * code of the given locale.
	 *
	 * @param lang
	 *            the locale to resolve; when {@code null} the document has no
	 *            language
	 */
	public IndexDocument(Locale lang) {
		this(lang == null ? null : LanguageEnum.findByCode(lang.getLanguage()));
	}

	/**
	 * Collects the target field names declared by the {@code copy} children of
	 * a field node.
	 *
	 * @param fieldNode
	 *            the node whose {@code copy} children are read
	 * @return the {@code field} attribute of every {@code copy} child that has
	 *         one, or {@code null} when there is no {@code copy} child at all
	 * @throws XPathExpressionException
	 *             declared for the node and attribute lookups
	 */
	private List<String> getCopyFieldList(Node fieldNode) throws XPathExpressionException {
		List<Node> copyNodes = DomUtils.getNodes(fieldNode, "copy");
		if (copyNodes == null || copyNodes.isEmpty())
			return null;
		List<String> copyList = new ArrayList<String>(copyNodes.size());
		for (Node copyNode : copyNodes) {
			String f = XPathParser.getAttributeString(copyNode, "field");
			// A copy node without a field attribute contributes nothing.
			if (f != null)
				copyList.add(f);
		}
		return copyList;
	}

	/**
	 * Creates a new instance of IndexDocument from an XML structure.
	 * <p>
	 * The language is looked up from the {@code lang} attribute of the
	 * document node. Every {@code field} child contributes the text of its
	 * {@code value} children:
	 * </p>
	 *
	 * <pre>
	 * &lt;field name="FIELDNAME"&gt;
	 *   &lt;value&gt;VALUE1&lt;/value&gt;
	 *   &lt;value&gt;VALUE2&lt;/value&gt;
	 * &lt;/field&gt;
	 * </pre>
	 * <p>
	 * A {@code value} node may carry the attributes {@code removeTag="yes"},
	 * {@code convertHtmlEntities="yes"} and {@code boost}. HTML entities are
	 * converted before tags are removed. Each value is also added to every
	 * field named by a {@code copy} child ({@code field} attribute) of the
	 * field node. Null or empty values are not added.
	 * </p>
	 * <p>
	 * Every {@code binary} child is handed to a parser, selected from its
	 * {@code url} attribute, its text content, or its {@code filePath}
	 * attribute (in that order), and the parser result is populated into this
	 * document. The attributes {@code fileName}, {@code filePath} and
	 * {@code contentType} are also accepted in all-lowercase form.
	 * {@code faultTolerant="yes"} makes a failing binary be logged and skipped
	 * instead of aborting.
	 * </p>
	 *
	 * @param client
	 *            passed through to the binary handling
	 * @param parserSelector
	 *            used to parse the binary nodes
	 * @param documentNode
	 *            the XML node describing the document
	 * @param urlDefaultCredential
	 *            credential used when a binary is downloaded from a URL
	 * @param httpDownloader
	 *            downloader used when a binary is referenced by URL
	 * @throws XPathExpressionException
	 *             declared by the node and attribute lookups
	 * @throws SearchLibException
	 *             if a binary cannot be processed and is not fault tolerant
	 * @throws ClassNotFoundException
	 *             declared by the binary handling
	 * @throws IllegalAccessException
	 *             declared by the binary handling
	 * @throws InstantiationException
	 *             declared by the binary handling
	 * @throws DOMException
	 *             may be raised when reading the text content of a node
	 * @throws IOException
	 *             declared by the binary handling
	 * @throws URISyntaxException
	 *             declared by the binary handling
	 */
	public IndexDocument(Client client, ParserSelector parserSelector, Node documentNode,
			CredentialItem urlDefaultCredential, HttpDownloader httpDownloader)
			throws XPathExpressionException, SearchLibException, InstantiationException, IllegalAccessException,
			ClassNotFoundException, IOException, URISyntaxException {
		this(LanguageEnum.findByCode(XPathParser.getAttributeString(documentNode, "lang")));
		// getCopyFieldList treats a null result of DomUtils.getNodes as
		// possible, so the same guard is applied to every lookup here.
		List<Node> fieldNodes = DomUtils.getNodes(documentNode, "field");
		if (fieldNodes != null) {
			for (Node fieldNode : fieldNodes) {
				List<String> copyFieldList = getCopyFieldList(fieldNode);
				String fieldName = XPathParser.getAttributeString(fieldNode, "name");
				List<Node> valueNodes = DomUtils.getNodes(fieldNode, "value");
				if (valueNodes == null)
					continue;
				for (Node valueNode : valueNodes) {
					boolean removeTag = "yes".equalsIgnoreCase(XPathParser.getAttributeString(valueNode, "removeTag"));
					boolean convertHtmlEntities =
							"yes".equalsIgnoreCase(XPathParser.getAttributeString(valueNode, "convertHtmlEntities"));

					String textContent = valueNode.getTextContent();
					// Entities are decoded first so that escaped markup is
					// also seen by the tag removal.
					if (convertHtmlEntities)
						textContent = StringEscapeUtils.unescapeHtml4(textContent);
					if (removeTag)
						textContent = StringUtils.removeTag(textContent);
					Float boost = XPathParser.getAttributeFloat(valueNode, "boost");
					add(fieldName, textContent, boost);
					if (copyFieldList != null)
						for (String f : copyFieldList)
							add(f, textContent, boost);
				}
			}
		}
		List<Node> binaryNodes = DomUtils.getNodes(documentNode, "binary");
		if (binaryNodes == null)
			return;
		for (Node node : binaryNodes) {
			boolean bFaultTolerant = "yes".equalsIgnoreCase(XPathParser.getAttributeString(node, "faultTolerant"));
			String filename = getAttributeWithFallback(node, "fileName", "filename");
			String filePath = getAttributeWithFallback(node, "filePath", "filepath");
			String contentType = getAttributeWithFallback(node, "contentType", "contenttype");
			String content = node.getTextContent();
			String url = XPathParser.getAttributeString(node, "url");
			Parser parser = doBinary(url, content, filePath, filename, client, parserSelector, contentType,
					urlDefaultCredential, httpDownloader, bFaultTolerant);
			if (parser != null)
				parser.popupateResult(0, this);
		}
	}

	/**
	 * Reads an attribute, falling back to an alternative attribute name when
	 * the first one is missing or empty.
	 *
	 * @param node
	 *            the node carrying the attributes
	 * @param name
	 *            the preferred attribute name
	 * @param fallbackName
	 *            the attribute name tried when the preferred one yields
	 *            {@code null} or an empty string
	 * @return the value of the preferred attribute, otherwise whatever the
	 *         fallback lookup returns
	 * @throws XPathExpressionException
	 *             declared for the attribute lookup
	 */
	private static String getAttributeWithFallback(Node node, String name, String fallbackName)
			throws XPathExpressionException {
		String value = XPathParser.getAttributeString(node, name);
		if (value == null || value.length() == 0)
			value = XPathParser.getAttributeString(node, fallbackName);
		return value;
	}

	/**
	 * Parses one binary entry, choosing its source in this order: the URL when
	 * it is not {@code null}, else the inline content when it is not empty,
	 * else the file path when it is not empty.
	 * <p>
	 * Every failure is logged through {@link ErrorParserLogger}. When
	 * {@code bFaultTolerant} is set the failure is then dropped and
	 * {@code null} is returned; otherwise it is propagated.
	 * </p>
	 *
	 * @return the parser that processed the binary, or {@code null} when no
	 *         source was given or a tolerated failure occurred
	 * @throws SearchLibException
	 *             on a non-tolerated failure; runtime exceptions other than
	 *             NullPointerException and IllegalArgumentException are
	 *             wrapped in it
	 */
	private Parser doBinary(String url, String content, String filePath, String filename, Client client,
			ParserSelector parserSelector, String contentType, CredentialItem urlDefaultCredential,
			HttpDownloader httpDownloader, boolean bFaultTolerant)
			throws IOException, URISyntaxException, InstantiationException, IllegalAccessException,
			ClassNotFoundException, SearchLibException {
		try {
			if (url != null) {
				return binaryFromUrl(parserSelector, url, urlDefaultCredential, httpDownloader);
			}
			if (content != null && content.length() > 0) {
				return binaryFromBase64(parserSelector, filename, contentType, content);
			}
			if (filePath != null && filePath.length() > 0) {
				return binaryFromFile(parserSelector, filename, contentType, filePath);
			}
			return null;
		} catch (SearchLibException e) {
			ErrorParserLogger.log(url, filename, filePath, e);
			if (bFaultTolerant) {
				return null;
			}
			throw e;
		} catch (NullPointerException e) {
			ErrorParserLogger.log(url, filename, filePath, e);
			if (bFaultTolerant) {
				return null;
			}
			throw e;
		} catch (IllegalArgumentException e) {
			ErrorParserLogger.log(url, filename, filePath, e);
			if (bFaultTolerant) {
				return null;
			}
			throw e;
		} catch (RuntimeException e) {
			ErrorParserLogger.log(url, filename, filePath, e);
			if (bFaultTolerant) {
				return null;
			}
			throw new SearchLibException(e);
		} catch (Exception e) {
			ErrorParserLogger.log(url, filename, filePath, e);
			if (bFaultTolerant) {
				return null;
			}
			throw new SearchLibException(e);
		}
	}

	/**
	 * Downloads the resource at {@code url} and hands its content stream to the
	 * parser selector, using the file name and base content type reported by
	 * the download and the language of this document.
	 *
	 * @param parserSelector
	 *            selects and runs the parser
	 * @param url
	 *            the address to download
	 * @param credentialItem
	 *            credential passed to the downloader
	 * @param httpDownloader
	 *            performs the download
	 * @return the parser returned by the parser selector
	 * @throws SearchLibException
	 *             wrapping any exception raised while downloading or parsing
	 */
	private Parser binaryFromUrl(ParserSelector parserSelector, String url, CredentialItem credentialItem,
			HttpDownloader httpDownloader) throws SearchLibException {
		try {
			final URI uri = new URI(url);
			final DownloadItem item = httpDownloader.get(uri, credentialItem);
			item.checkNoErrorList(200);
			return parserSelector.parseStream(null, item.getFileName(), item.getContentBaseType(), url,
					item.getContentInputStream(), lang, null, null);
		} catch (Exception e) {
			// Covers runtime and checked exceptions alike; both get the same message.
			throw new SearchLibException("Parser error while getting binary from URL: " + url, e);
		}
	}

	/**
	 * Hands inline content to the parser selector's Base64 entry point, using
	 * the language of this document.
	 *
	 * @param parserSelector
	 *            selects and runs the parser
	 * @param filename
	 *            file name passed to the parser selector
	 * @param contentType
	 *            content type passed to the parser selector
	 * @param content
	 *            the inline content
	 * @return the parser returned by the parser selector
	 * @throws SearchLibException
	 *             wrapping any exception raised while parsing
	 */
	private Parser binaryFromBase64(ParserSelector parserSelector, String filename, String contentType, String content)
			throws SearchLibException {
		try {
			final Parser parsed = parserSelector.parseBase64(null, filename, contentType, null, content, lang);
			return parsed;
		} catch (Exception e) {
			// Covers runtime and checked exceptions alike; both get the same message.
			throw new SearchLibException("Parser error while getting binary : " + filename + " /" + contentType, e);
		}
	}

	/**
	 * Hands a local file to the parser selector, using the language of this
	 * document. When {@code filePath} denotes a directory, the file named
	 * {@code filename} inside that directory is used instead.
	 *
	 * @param parserSelector
	 *            selects and runs the parser
	 * @param filename
	 *            file name passed to the parser selector, also used to resolve
	 *            the file when {@code filePath} is a directory
	 * @param contentType
	 *            content type passed to the parser selector
	 * @param filePath
	 *            path of the file, or of the directory containing it
	 * @return the parser returned by the parser selector
	 * @throws SearchLibException
	 *             wrapping any exception raised while resolving or parsing
	 */
	private Parser binaryFromFile(ParserSelector parserSelector, String filename, String contentType, String filePath)
			throws SearchLibException {
		try {
			final File location = new File(filePath);
			final File target = location.isDirectory() ? new File(location, filename) : location;
			return parserSelector.parseFile(null, filename, contentType, null, target, lang);
		} catch (Exception e) {
			// Covers runtime and checked exceptions alike; both get the same message.
			throw new SearchLibException("Parser error while getting binary from file : " + filePath + " /" + filename,
					e);
		}
	}

	/**
	 * Returns the content registered for a field, creating and registering an
	 * empty one when the field is not present yet.
	 *
	 * @param field
	 *            the field name
	 * @return the content of the field, or {@code null} when {@code field} is
	 *         {@code null}
	 */
	public FieldContent getFieldContent(String field) {
		if (field == null) {
			return null;
		}
		// The interned name is used both as map key and as the content's name.
		final String key = field.intern();
		FieldContent existing = fields.get(key);
		if (existing != null) {
			return existing;
		}
		FieldContent created = new FieldContent(key);
		fields.put(key, created);
		return created;
	}

	/**
	 * Appends a value item to the content of a field, creating the field
	 * content when needed. Does nothing when {@code field} is {@code null}.
	 *
	 * @param field
	 *            the field name
	 * @param fieldValueItem
	 *            the item to append
	 */
	public void add(String field, FieldValueItem fieldValueItem) {
		if (field != null) {
			getFieldContent(field).add(fieldValueItem);
		}
	}

	/**
	 * Adds a boosted text value to a field as an item of origin
	 * {@code EXTERNAL}. A {@code null} or empty value is ignored.
	 *
	 * @param field
	 *            the field name
	 * @param value
	 *            the text to add
	 * @param boost
	 *            the boost passed to the value item
	 */
	public void add(String field, String value, Float boost) {
		if (value != null && value.length() > 0) {
			FieldValueItem item = new FieldValueItem(FieldValueOriginEnum.EXTERNAL, value, boost);
			add(field, item);
		}
	}

	/**
	 * Adds the string form ({@code toString()}) of an object to a field. A
	 * {@code null} object is ignored.
	 *
	 * @param field
	 *            the field name
	 * @param object
	 *            the object whose string form is added
	 */
	public void addObject(String field, Object object) {
		if (object != null) {
			String text = object.toString();
			addString(field, text);
		}
	}

	/**
	 * Adds a text value to a field as an item of origin {@code EXTERNAL}. Only
	 * a {@code null} value is ignored; an empty string is still added.
	 *
	 * @param field
	 *            the field name
	 * @param value
	 *            the text to add
	 */
	public void addString(String field, String value) {
		if (value != null) {
			FieldValueItem item = new FieldValueItem(FieldValueOriginEnum.EXTERNAL, value);
			add(field, item);
		}
	}

	/**
	 * Adds every field content of another document to a single field of this
	 * document. A {@code null} source is ignored.
	 *
	 * @param field
	 *            the field of this document receiving the contents
	 * @param source
	 *            the document whose field contents are iterated
	 */
	public void addFieldIndexDocument(String field, IndexDocument source) {
		if (source != null) {
			for (FieldContent content : source) {
				add(field, content);
			}
		}
	}

	/**
	 * Appends every item of a list to a field, in list order. A {@code null}
	 * list is ignored.
	 *
	 * @param field
	 *            the field name
	 * @param values
	 *            the items to append
	 */
	public void addFieldValueList(String field, List<FieldValueItem> values) {
		if (values == null)
			return;
		// The element type is declared so the loop variable needs no cast.
		for (FieldValueItem value : values)
			add(field, value);
	}

	public void addObjectList(String field, List