com.jaeksoft.searchlib.index.IndexDocument Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you will be able to quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.
There is a newer version: 1.5.14
Show newest version
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2008-2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.index;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;

import javax.xml.xpath.XPathExpressionException;

import org.apache.commons.lang.StringEscapeUtils;
import org.w3c.dom.DOMException;
import org.w3c.dom.Node;

import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.LanguageEnum;
import com.jaeksoft.searchlib.crawler.web.database.CredentialItem;
import com.jaeksoft.searchlib.crawler.web.spider.DownloadItem;
import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader;
import com.jaeksoft.searchlib.logreport.ErrorParserLogger;
import com.jaeksoft.searchlib.parser.Parser;
import com.jaeksoft.searchlib.parser.ParserSelector;
import com.jaeksoft.searchlib.schema.FieldValueItem;
import com.jaeksoft.searchlib.schema.FieldValueOriginEnum;
import com.jaeksoft.searchlib.schema.Schema;
import com.jaeksoft.searchlib.schema.SchemaField;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.XPathParser;

public class IndexDocument implements Iterable {

	/** Field contents of this document, sorted and keyed by field name. */
	private Map<String, FieldContent> fields;

	/** Language of the document; {@code null} when none has been set. */
	private LanguageEnum lang;

	/**
	 * Reset to {@code null} by the constructor and by every
	 * {@code add(String, FieldValueItem)} call. NOTE(review): presumably a
	 * lazily built array view of {@link #fields}; the code filling it is not
	 * part of this excerpt.
	 */
	private FieldContent[] fieldContentArray;

	/**
	 * Creates an empty document without any language.
	 */
	public IndexDocument() {
		// Explicit type arguments instead of a raw TreeMap, so that lookups
		// such as "FieldContent fc = fields.get(field)" type-check.
		fields = new TreeMap<String, FieldContent>();
		this.lang = null;
		fieldContentArray = null;
	}

	/**
	 * Creates a document holding the language and the field entries of
	 * another document.
	 * 
	 * @param sourceDocument
	 *            the document to copy from; must not be {@code null}
	 */
	public IndexDocument(IndexDocument sourceDocument) {
		this(sourceDocument.lang);
		// Each entry goes through the add(...) overload matching the entry
		// value type, so the usual bookkeeping of add(...) applies.
		for (Map.Entry<String, FieldContent> entry : sourceDocument.fields
				.entrySet())
			add(entry.getKey(), entry.getValue());
	}

	/**
	 * Creates an empty document carrying the given language.
	 * 
	 * @param lang
	 *            the language to store; stored as is, {@code null} included
	 */
	public IndexDocument(LanguageEnum lang) {
		this();
		this.lang = lang;
	}

	public IndexDocument(Locale lang) {
		this();
		if (lang != null)
			this.lang = LanguageEnum.findByCode(lang.getLanguage());
	}

	/**
	 * Collects the target field names declared by the {@code copy} children
	 * of a field node.
	 * 
	 * @param fieldNode
	 *            the node whose {@code copy} children are inspected
	 * @return the {@code field} attribute of every {@code copy} child that
	 *         defines one, or {@code null} when the node has no {@code copy}
	 *         child at all
	 * @throws XPathExpressionException
	 *             if one of the DOM/XPath helper calls fails with it
	 */
	private final List<String> getCopyFieldList(Node fieldNode)
			throws XPathExpressionException {
		List<Node> copyNodes = DomUtils.getNodes(fieldNode, "copy");
		if (copyNodes == null || copyNodes.isEmpty())
			return null;
		// At most one name per copy node, so the node count is an upper bound.
		List<String> copyList = new ArrayList<String>(copyNodes.size());
		for (Node copyNode : copyNodes) {
			String f = XPathParser.getAttributeString(copyNode, "field");
			if (f != null)
				copyList.add(f);
		}
		return copyList;
	}

	/**
	 * Creates a new instance of IndexDocument from an XML structure.
	 * <p>
	 * The following attributes and children of {@code documentNode} are read
	 * (the element name of {@code documentNode} itself is not checked):
	 * 
	 * <pre>
	 * &lt;document lang="LANG"&gt;
	 *   &lt;field name="FIELDNAME"&gt;
	 *     &lt;copy field="OTHERFIELD"/&gt;
	 *     &lt;value boost="BOOST" removeTag="yes" convertHtmlEntities="yes"&gt;VALUE1&lt;/value&gt;
	 *     &lt;value&gt;VALUE2&lt;/value&gt;
	 *   &lt;/field&gt;
	 *   &lt;binary url="URL" fileName="NAME" filePath="PATH"
	 *           contentType="TYPE" faultTolerant="yes"&gt;BASE64&lt;/binary&gt;
	 * &lt;/document&gt;
	 * </pre>
	 * 
	 * Every value is added to its own field and to each field named by a
	 * {@code copy} child. When both flags are set on a value, HTML entities are
	 * converted before tags are removed. The {@code fileName},
	 * {@code filePath} and {@code contentType} attributes of a binary node are
	 * also accepted in all-lowercase spelling.
	 * 
	 * @param client
	 *            forwarded to the binary handling
	 * @param parserSelector
	 *            used to parse the content of the binary nodes
	 * @param documentNode
	 *            the XML node describing the document
	 * @param urlDefaultCredential
	 *            the credential used when a binary is fetched from a URL
	 * @param httpDownloader
	 *            the downloader used when a binary is fetched from a URL
	 * @throws XPathExpressionException
	 *             if one of the DOM/XPath helper calls fails with it
	 * @throws SearchLibException
	 *             if a binary cannot be handled and its node is not marked
	 *             {@code faultTolerant="yes"}
	 * @throws ClassNotFoundException
	 *             declared by the binary handling
	 * @throws IllegalAccessException
	 *             declared by the binary handling
	 * @throws InstantiationException
	 *             declared by the binary handling
	 * @throws DOMException
	 *             if the text content of a node cannot be read
	 * @throws IOException
	 *             declared by the binary handling
	 * @throws URISyntaxException
	 *             declared by the binary handling
	 */
	public IndexDocument(Client client, ParserSelector parserSelector,
			Node documentNode, CredentialItem urlDefaultCredential,
			HttpDownloader httpDownloader) throws XPathExpressionException,
			SearchLibException, InstantiationException, IllegalAccessException,
			ClassNotFoundException, IOException, URISyntaxException {
		this(LanguageEnum.findByCode(XPathParser.getAttributeString(
				documentNode, "lang")));
		// getCopyFieldList treats a null result of DomUtils.getNodes as
		// "no node"; the same caution is applied to every node list here.
		List<Node> fieldNodes = DomUtils.getNodes(documentNode, "field");
		if (fieldNodes != null) {
			for (Node fieldNode : fieldNodes) {
				List<String> copyFieldList = getCopyFieldList(fieldNode);
				String fieldName = XPathParser.getAttributeString(fieldNode,
						"name");
				List<Node> valueNodes = DomUtils.getNodes(fieldNode, "value");
				if (valueNodes == null)
					continue;
				for (Node valueNode : valueNodes) {
					boolean removeTag = "yes".equalsIgnoreCase(XPathParser
							.getAttributeString(valueNode, "removeTag"));
					boolean convertHtmlEntities = "yes"
							.equalsIgnoreCase(XPathParser.getAttributeString(
									valueNode, "convertHtmlEntities"));

					String textContent = valueNode.getTextContent();
					// Entities first, so that escaped markup is removed too.
					if (convertHtmlEntities)
						textContent = StringEscapeUtils
								.unescapeHtml(textContent);
					if (removeTag)
						textContent = StringUtils.removeTag(textContent);
					Float boost = XPathParser.getAttributeFloat(valueNode,
							"boost");
					add(fieldName, textContent, boost);
					if (copyFieldList != null)
						for (String f : copyFieldList)
							add(f, textContent, boost);
				}
			}
		}
		List<Node> binaryNodes = DomUtils.getNodes(documentNode, "binary");
		if (binaryNodes == null)
			return;
		for (Node node : binaryNodes) {
			boolean bFaultTolerant = "yes".equalsIgnoreCase(XPathParser
					.getAttributeString(node, "faultTolerant"));
			// Each attribute is looked up in camel case first, then in
			// lowercase.
			String filename = XPathParser.getAttributeString(node, "fileName");
			if (filename == null || filename.length() == 0)
				filename = XPathParser.getAttributeString(node, "filename");
			String filePath = XPathParser.getAttributeString(node, "filePath");
			if (filePath == null || filePath.length() == 0)
				filePath = XPathParser.getAttributeString(node, "filepath");
			String contentType = XPathParser.getAttributeString(node,
					"contentType");
			if (contentType == null || contentType.length() == 0)
				contentType = XPathParser.getAttributeString(node,
						"contenttype");
			String content = node.getTextContent();
			String url = XPathParser.getAttributeString(node, "url");
			Parser parser = doBinary(url, content, filePath, filename, client,
					parserSelector, contentType, urlDefaultCredential,
					httpDownloader, bFaultTolerant);
			// A null parser means nothing was parsed, or a tolerated failure.
			if (parser != null)
				parser.popupateResult(0, this);
		}
	}

	/**
	 * Parses one binary, taken from the first available source: the URL, then
	 * the inline (Base64) content, then the file path.
	 * <p>
	 * Every failure is logged through {@link ErrorParserLogger}. When
	 * {@code bFaultTolerant} is set the failure is then swallowed and
	 * {@code null} is returned. Otherwise {@link SearchLibException},
	 * {@link NullPointerException} and {@link IllegalArgumentException} are
	 * rethrown unchanged and any other exception is wrapped in a
	 * {@link SearchLibException}.
	 * 
	 * @param url
	 *            the URL to download from; used whenever it is not
	 *            {@code null}
	 * @param content
	 *            the inline content; used when there is no URL and it is not
	 *            empty
	 * @param filePath
	 *            the file location; used when neither URL nor content applies
	 * @param filename
	 *            the name of the file, passed to the parser
	 * @param client
	 *            not used by this method
	 * @param parserSelector
	 *            performs the actual parsing
	 * @param contentType
	 *            the content type, passed to the parser
	 * @param urlDefaultCredential
	 *            the credential for the download
	 * @param httpDownloader
	 *            the downloader for the URL case
	 * @param bFaultTolerant
	 *            whether failures are tolerated
	 * @return the parser holding the result, or {@code null} when no source
	 *         applies or a failure was tolerated
	 */
	private Parser doBinary(String url, String content, String filePath,
			String filename, Client client, ParserSelector parserSelector,
			String contentType, CredentialItem urlDefaultCredential,
			HttpDownloader httpDownloader, boolean bFaultTolerant)
			throws IOException, URISyntaxException, InstantiationException,
			IllegalAccessException, ClassNotFoundException, SearchLibException {
		final boolean hasContent = content != null && content.length() > 0;
		final boolean hasFilePath = filePath != null && filePath.length() > 0;
		try {
			if (url != null)
				return binaryFromUrl(parserSelector, url,
						urlDefaultCredential, httpDownloader);
			if (hasContent)
				return binaryFromBase64(parserSelector, filename,
						contentType, content);
			if (hasFilePath)
				return binaryFromFile(parserSelector, filename, contentType,
						filePath);
			return null;
		} catch (SearchLibException e) {
			ErrorParserLogger.log(url, filename, filePath, e);
			if (bFaultTolerant)
				return null;
			throw e;
		} catch (NullPointerException e) {
			ErrorParserLogger.log(url, filename, filePath, e);
			if (bFaultTolerant)
				return null;
			throw e;
		} catch (IllegalArgumentException e) {
			ErrorParserLogger.log(url, filename, filePath, e);
			if (bFaultTolerant)
				return null;
			throw e;
		} catch (Exception e) {
			// Remaining runtime and checked exceptions alike are wrapped.
			ErrorParserLogger.log(url, filename, filePath, e);
			if (bFaultTolerant)
				return null;
			throw new SearchLibException(e);
		}
	}

	/**
	 * Downloads a binary from a URL and hands the downloaded stream to the
	 * parser selector, together with the file name and base content type
	 * reported by the download and the language of this document.
	 * 
	 * @param parserSelector
	 *            performs the actual parsing
	 * @param url
	 *            the URL to download
	 * @param credentialItem
	 *            the credential passed to the downloader
	 * @param httpDownloader
	 *            the downloader to use
	 * @return the parser returned by the parser selector
	 * @throws SearchLibException
	 *             wrapping any exception raised while downloading or parsing
	 */
	private Parser binaryFromUrl(ParserSelector parserSelector, String url,
			CredentialItem credentialItem, HttpDownloader httpDownloader)
			throws SearchLibException {
		try {
			final URI uri = new URI(url);
			final DownloadItem download = httpDownloader.get(uri,
					credentialItem);
			// NOTE(review): presumably fails unless the HTTP status is 200.
			download.checkNoErrorList(200);
			return parserSelector.parseStream(null, download.getFileName(),
					download.getContentBaseType(), url,
					download.getContentInputStream(), lang, null, null);
		} catch (Exception e) {
			// One handler is enough: runtime exceptions are exceptions too.
			throw new SearchLibException(
					"Parser error while getting binary from URL: " + url, e);
		}
	}

	/**
	 * Hands inline Base64 content to the parser selector, together with the
	 * language of this document.
	 * 
	 * @param parserSelector
	 *            performs the actual parsing
	 * @param filename
	 *            the name of the file, passed to the parser
	 * @param contentType
	 *            the content type, passed to the parser
	 * @param content
	 *            the Base64 content
	 * @return the parser returned by the parser selector
	 * @throws SearchLibException
	 *             wrapping any exception raised while parsing
	 */
	private Parser binaryFromBase64(ParserSelector parserSelector,
			String filename, String contentType, String content)
			throws SearchLibException {
		try {
			final Parser parsed = parserSelector.parseBase64(null, filename,
					contentType, null, content, lang);
			return parsed;
		} catch (Exception e) {
			// One handler is enough: runtime exceptions are exceptions too.
			throw new SearchLibException("Parser error while getting binary : "
					+ filename + " /" + contentType, e);
		}
	}

	/**
	 * Hands a local file to the parser selector, together with the language of
	 * this document. When {@code filePath} denotes a directory, the file named
	 * {@code filename} inside that directory is parsed; otherwise
	 * {@code filePath} itself is.
	 * 
	 * @param parserSelector
	 *            performs the actual parsing
	 * @param filename
	 *            the name of the file, passed to the parser
	 * @param contentType
	 *            the content type, passed to the parser
	 * @param filePath
	 *            the path of the file or of its directory
	 * @return the parser returned by the parser selector
	 * @throws SearchLibException
	 *             wrapping any exception raised while resolving or parsing the
	 *             file
	 */
	private Parser binaryFromFile(ParserSelector parserSelector,
			String filename, String contentType, String filePath)
			throws SearchLibException {
		try {
			final File location = new File(filePath);
			final File target = location.isDirectory() ? new File(location,
					filename) : location;
			return parserSelector.parseFile(null, filename, contentType, null,
					target, lang);
		} catch (Exception e) {
			// One handler is enough: runtime exceptions are exceptions too.
			throw new SearchLibException(
					"Parser error while getting binary from file : " + filePath
							+ " /" + filename, e);
		}
	}

	/**
	 * Returns the content registered for a field, creating and registering a
	 * new one when the document has none yet. A non-null field name is
	 * interned before being used as key.
	 * <p>
	 * NOTE(review): unlike {@code add(String, FieldValueItem)}, registering a
	 * new content here does not reset {@code fieldContentArray} - confirm this
	 * is intended.
	 * 
	 * @param field
	 *            the name of the field
	 * @return the existing or newly created content, never {@code null}
	 */
	public FieldContent getFieldContent(String field) {
		if (field != null)
			field = field.intern();
		FieldContent content = fields.get(field);
		if (content != null)
			return content;
		content = new FieldContent(field);
		fields.put(field, content);
		return content;
	}

	/**
	 * Appends a value item to a field, creating the field content on first
	 * use, and resets {@code fieldContentArray}.
	 * 
	 * @param field
	 *            the name of the field
	 * @param fieldValueItem
	 *            the value item to append
	 */
	public void add(String field, FieldValueItem fieldValueItem) {
		getFieldContent(field).add(fieldValueItem);
		// The document changed: drop the array so that it is not stale.
		fieldContentArray = null;
	}

	/**
	 * Appends a boosted text value of {@code EXTERNAL} origin to a field.
	 * {@code null} and empty values are ignored.
	 * 
	 * @param field
	 *            the name of the field
	 * @param value
	 *            the text to append
	 * @param boost
	 *            the boost passed to the value item as is
	 */
	public void add(String field, String value, Float boost) {
		final boolean hasText = value != null && value.length() > 0;
		if (hasText)
			add(field, new FieldValueItem(FieldValueOriginEnum.EXTERNAL,
					value, boost));
	}

	/**
	 * Appends the {@code toString()} form of an object to a field.
	 * {@code null} objects are ignored.
	 * 
	 * @param field
	 *            the name of the field
	 * @param object
	 *            the object whose string form is appended
	 */
	public void addObject(String field, Object object) {
		if (object != null)
			addString(field, object.toString());
	}

	/**
	 * Appends a text value of {@code EXTERNAL} origin to a field. Only
	 * {@code null} is ignored; an empty string is appended like any other
	 * value.
	 * 
	 * @param field
	 *            the name of the field
	 * @param value
	 *            the text to append
	 */
	public void addString(String field, String value) {
		if (value != null)
			add(field,
					new FieldValueItem(FieldValueOriginEnum.EXTERNAL, value));
	}

	/**
	 * Appends every value item of a list to a field, in list order. A
	 * {@code null} list is ignored.
	 * 
	 * @param field
	 *            the name of the field
	 * @param values
	 *            the value items to append; may be {@code null}
	 */
	public void addFieldValueList(String field, List<FieldValueItem> values) {
		if (values == null)
			return;
		// The element type is declared so that the typed for-each compiles
		// without a cast.
		for (FieldValueItem value : values)
			add(field, value);
	}

	public void addObjectList(String field, List