All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.jaeksoft.searchlib.crawler.FieldMapGeneric Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class, search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTFul API you will be able to integrate quickly and easily advanced full-text search capabilities in your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

There is a newer version: 1.5.14
Show newest version
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2010-2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.List;
import java.util.Set;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.xpath.XPathExpressionException;

import net.minidev.json.JSONArray;

import org.apache.commons.lang.StringEscapeUtils;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.common.database.CommonFieldTarget;
import com.jaeksoft.searchlib.crawler.file.database.FilePathItem;
import com.jaeksoft.searchlib.crawler.file.database.FileTypeEnum;
import com.jaeksoft.searchlib.crawler.file.process.FileInstanceAbstract;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList.ListType;
import com.jaeksoft.searchlib.crawler.web.process.WebCrawlThread;
import com.jaeksoft.searchlib.crawler.web.spider.Crawl;
import com.jaeksoft.searchlib.function.expression.SyntaxError;
import com.jaeksoft.searchlib.index.FieldContent;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.parser.Parser;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.schema.FieldValueItem;
import com.jaeksoft.searchlib.schema.FieldValueOriginEnum;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.LinkUtils;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.XPathParser;
import com.jaeksoft.searchlib.util.XmlWriter;
import com.jaeksoft.searchlib.util.map.GenericLink;
import com.jaeksoft.searchlib.util.map.GenericMap;
import com.jaeksoft.searchlib.util.map.SourceField;
import com.jaeksoft.searchlib.util.map.TargetField;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.PathNotFoundException;

/**
 * Base class for crawler field maps: a collection of links, each one
 * associating a source field (type {@code S}) with a target field (type
 * {@code T}). The map can be loaded from an XML node or file, written back as
 * XML, and applied to content in order to fill an {@link IndexDocument}.
 * 
 * @param <S>
 *            the kind of source field handled by the concrete map
 * @param <T>
 *            the kind of target field handled by the concrete map
 */
public abstract class FieldMapGeneric<S extends SourceField, T extends TargetField>
		extends GenericMap<S, T> {

	/** Backing XML file, or null when the map is not bound to a file. */
	private File mapFile;

	/**
	 * Creates an empty map that is not bound to any file.
	 */
	protected FieldMapGeneric() {
		mapFile = null;
	}

	/**
	 * Creates a map, not bound to any file, populated from the "link" child
	 * nodes of the given node.
	 * 
	 * @param parentNode
	 *            the node holding the "link" elements (may be null)
	 * @throws XPathExpressionException
	 *             if the node cannot be explored
	 */
	protected FieldMapGeneric(Node parentNode) throws XPathExpressionException {
		mapFile = null;
		load(parentNode);
	}

	/**
	 * Creates a map bound to the given file. If the file exists, the map is
	 * populated from the node located by rootXPath; otherwise the map stays
	 * empty.
	 * 
	 * @param mapFile
	 *            the XML file backing this map
	 * @param rootXPath
	 *            the XPath of the node holding the "link" elements
	 * @throws ParserConfigurationException
	 *             if the XML parser cannot be created
	 * @throws SAXException
	 *             if the file is not valid XML
	 * @throws IOException
	 *             if the file cannot be read
	 * @throws XPathExpressionException
	 *             if the XPath expression cannot be evaluated
	 */
	protected FieldMapGeneric(File mapFile, String rootXPath)
			throws ParserConfigurationException, SAXException, IOException,
			XPathExpressionException {
		this.mapFile = mapFile;
		if (!mapFile.exists())
			return;
		XPathParser xpp = new XPathParser(mapFile);
		load(xpp.getNode(rootXPath));
	}

	/**
	 * Builds the target field described by a "link" node.
	 * 
	 * @param targetName
	 *            the value of the "target" attribute
	 * @param node
	 *            the "link" node itself
	 * @return the target, or null if the link must be ignored
	 */
	protected abstract T loadTarget(String targetName, Node node);

	/**
	 * Builds the source field from its textual form.
	 * 
	 * @param source
	 *            the XML-unescaped value of the "source" attribute
	 * @return the source, or null if the link must be ignored
	 */
	protected abstract S loadSource(String source);

	/**
	 * Adds to this map every "link" child of the given node. A link is
	 * skipped when either {@link #loadSource(String)} or
	 * {@link #loadTarget(String, Node)} returns null. The whole operation is
	 * synchronized on this instance.
	 * 
	 * @param parentNode
	 *            the node holding the "link" elements; nothing is done when
	 *            null
	 * @throws XPathExpressionException
	 *             if the node cannot be explored
	 */
	public void load(Node parentNode) throws XPathExpressionException {
		synchronized (this) {
			if (parentNode == null)
				return;
			List<Node> nodeList = DomUtils.getNodes(parentNode, "link");
			for (Node node : nodeList) {
				// Only the source attribute is XML-unescaped; the target
				// attribute is used as read.
				String sourceName = StringEscapeUtils.unescapeXml(DomUtils
						.getAttributeText(node, "source"));
				S source = loadSource(sourceName);
				if (source == null)
					continue;
				String targetName = DomUtils.getAttributeText(node, "target");
				T target = loadTarget(targetName, node);
				if (target == null)
					continue;
				add(source, target);
			}
		}
	}

	/**
	 * Writes the target specific content nested inside a "link" element.
	 * 
	 * @param xmlWriter
	 *            the writer, positioned inside the open "link" element
	 * @param target
	 *            the target being written
	 * @throws SAXException
	 *             if the XML cannot be written
	 */
	protected abstract void writeTarget(XmlWriter xmlWriter, T target)
			throws SAXException;

	/**
	 * Writes one "link" element per link of this map, with the attributes
	 * "source", "target", "analyzer" and "boost", followed by the content
	 * produced by {@link #writeTarget(XmlWriter, TargetField)}.
	 * 
	 * @param xmlWriter
	 *            the writer receiving the elements
	 * @throws SAXException
	 *             if the XML cannot be written
	 */
	public void store(XmlWriter xmlWriter) throws SAXException {
		for (GenericLink<S, T> link : getList()) {
			T target = link.getTarget();
			// A null boost is passed through as a null attribute value.
			String boost = target.getBoost() == null ? null : Float
					.toString(target.getBoost());
			xmlWriter.startElement("link", "source", link.getSource()
					.toXmlAttribute(), "target", target.toXmlAttribute(),
					"analyzer", target.getAnalyzer(), "boost", boost);
			writeTarget(xmlWriter, target);
			xmlWriter.endElement();
		}
	}

	/**
	 * Writes this map into its backing file, as a "map" root element
	 * containing the links. The file is created if it does not exist. The
	 * whole operation is synchronized on this instance.
	 * 
	 * @throws IllegalStateException
	 *             if this map is not bound to a file
	 * @throws TransformerConfigurationException
	 *             if the XML writer cannot be set up
	 * @throws SAXException
	 *             if the XML cannot be written
	 * @throws IOException
	 *             if the file cannot be created or opened
	 */
	public void store() throws TransformerConfigurationException, SAXException,
			IOException {
		synchronized (this) {
			if (mapFile == null)
				throw new IllegalStateException(
						"This field map is not bound to a file");
			if (!mapFile.exists())
				mapFile.createNewFile();
			// The writer must use the encoding announced to XmlWriter, not
			// the platform default one.
			PrintWriter pw = new PrintWriter(mapFile, "UTF-8");
			try {
				XmlWriter xmlWriter = new XmlWriter(pw, "UTF-8");
				xmlWriter.startElement("map");
				store(xmlWriter);
				xmlWriter.endElement();
				xmlWriter.endDocument();
			} finally {
				pw.close();
			}
		}
	}

	/**
	 * Maps every value of the given field content to the target field.
	 * 
	 * @param context
	 *            the services used while mapping
	 * @param fc
	 *            the values to map; nothing is done when null
	 * @param targetField
	 *            the target field definition
	 * @param target
	 *            the document receiving the values
	 * @param filePathSet
	 *            the paths already handled, or null to disable the check
	 */
	final protected void mapFieldTarget(FieldMapContext context,
			FieldContent fc, CommonFieldTarget targetField,
			IndexDocument target, Set<String> filePathSet) throws IOException,
			SearchLibException, ParseException, SyntaxError,
			URISyntaxException, ClassNotFoundException, InterruptedException,
			InstantiationException, IllegalAccessException {
		if (fc == null)
			return;
		for (FieldValueItem fvi : fc.getValues())
			mapFieldTarget(context, targetField, fvi.value, target, filePathSet);
	}

	/**
	 * Applies the text transformations enabled on the target, in this order:
	 * HTML entities conversion, tag removal, regular expression.
	 * 
	 * @param dfTarget
	 *            the target field definition
	 * @param content
	 *            the text to transform
	 * @return the transformed text, or null if the content is empty
	 */
	final public String mapFieldTarget(CommonFieldTarget dfTarget,
			String content) {
		if (StringUtils.isEmpty(content))
			return null;
		if (dfTarget.isConvertHtmlEntities())
			content = StringEscapeUtils.unescapeHtml(content);
		if (dfTarget.isRemoveTag())
			content = StringUtils.removeTag(content);
		if (dfTarget.hasRegexpPattern())
			content = dfTarget.applyRegexPattern(content);
		return content;
	}

	/**
	 * Records a key in the set of already handled keys.
	 * 
	 * @param visited
	 *            the set of handled keys, or null when no tracking is wanted
	 * @param key
	 *            the key to record
	 * @return true if the key must be processed: either no set is provided,
	 *         or the key was not yet in the set
	 */
	private static boolean firstVisit(Set<String> visited, String key) {
		if (visited == null)
			return true;
		return visited.add(key);
	}

	/**
	 * Maps one value to the target field. Depending on the target
	 * definition, the value is also used as a local file path, as a path
	 * relative to a crawled file location, and/or as an URL, whose parsed or
	 * crawled content is merged into the target document. The value itself,
	 * once transformed by {@link #mapFieldTarget(CommonFieldTarget, String)},
	 * is finally added to the target field.
	 * 
	 * @param context
	 *            the services used while mapping (parser selector, file path
	 *            manager, web crawl master, language)
	 * @param dfTarget
	 *            the target field definition; nothing is done when null
	 * @param content
	 *            the value to map; nothing is done when empty
	 * @param target
	 *            the document receiving the values
	 * @param filePathSet
	 *            the paths already handled, used to avoid parsing the same
	 *            file twice; may be null, in which case files are always
	 *            parsed
	 * @throws SearchLibException
	 *             if no file path item matches the crawl file prefix
	 */
	final protected void mapFieldTarget(FieldMapContext context,
			CommonFieldTarget dfTarget, String content, IndexDocument target,
			Set<String> filePathSet) throws SearchLibException, IOException,
			ParseException, SyntaxError, URISyntaxException,
			ClassNotFoundException, InterruptedException,
			InstantiationException, IllegalAccessException {
		if (dfTarget == null)
			return;
		if (StringUtils.isEmpty(content))
			return;
		if (dfTarget.isFilePath()) {
			String filePath = dfTarget.getFilePath(content);
			if (firstVisit(filePathSet, filePath)) {
				File file = new File(filePath);
				if (file.exists()) {
					Parser parser = context.parserSelector.parseFile(null,
							file.getName(), null, null, file, context.lang);
					if (parser != null)
						parser.popupateResult(0, target);
				} else {
					Logging.error("File don't exist:" + file.getAbsolutePath());
				}
			}
		}
		if (dfTarget.isCrawlFile()) {
			String filePathName = dfTarget.getFilePathPrefix();
			if (firstVisit(filePathSet, content)) {
				URI filePathURI = new URI(filePathName);
				FilePathItem filePathItem = context.filePathManager.findFirst(
						filePathURI.getScheme(), filePathURI.getHost());
				if (filePathItem == null)
					throw new SearchLibException("FilePathItem not found: "
							+ filePathName);
				FileInstanceAbstract fileInstance = FileInstanceAbstract
						.create(filePathItem, null, filePathItem.getPath()
								+ content);
				// Only regular files are parsed.
				if (fileInstance.getFileType() == FileTypeEnum.file) {
					Parser parser = context.parserSelector.parseStream(null,
							fileInstance.getFileName(), null, null,
							fileInstance.getInputStream(), context.lang, null,
							null);
					if (parser != null)
						parser.popupateResult(0, target);
				}
			}
		}
		if (dfTarget.isCrawlUrl()) {
			WebCrawlThread crawlThread = context.webCrawlMaster.manualCrawl(
					LinkUtils.newEncodedURL(content), ListType.DBCRAWL);
			crawlThread.waitForStart(60);
			crawlThread.waitForEnd(60);
			Crawl crawl = crawlThread.getCurrentCrawl();
			if (crawl != null) {
				IndexDocument targetIndexDocument = crawl
						.getTargetIndexDocument(0);
				if (targetIndexDocument != null)
					target.add(targetIndexDocument);
			}
		}
		content = mapFieldTarget(dfTarget, content);
		target.add(dfTarget.getName(), new FieldValueItem(
				FieldValueOriginEnum.EXTERNAL, content));
	}

	/**
	 * Applies this map to a JSON structure. The unique name of each link
	 * source is evaluated as a JSON path; every non null result (each item
	 * when the result is an array) is mapped, as a string, to the link
	 * target. A path which is not found is silently ignored; an
	 * {@link IllegalArgumentException} is logged as a warning and the link is
	 * skipped.
	 * 
	 * @param context
	 *            the services used while mapping
	 * @param jsonObject
	 *            the JSON structure to read
	 * @param target
	 *            the document receiving the values
	 */
	public void mapJson(FieldMapContext context, Object jsonObject,
			IndexDocument target) throws SearchLibException, IOException,
			ParseException, SyntaxError, URISyntaxException,
			ClassNotFoundException, InterruptedException,
			InstantiationException, IllegalAccessException {
		for (GenericLink<S, T> link : getList()) {
			String jsonPath = link.getSource().getUniqueName();
			try {
				Object jsonContent = JsonPath.read(jsonObject, jsonPath);
				if (jsonContent == null)
					continue;
				CommonFieldTarget fieldTarget = (CommonFieldTarget) link
						.getTarget();
				// No path tracking here: a null set is passed on purpose.
				if (jsonContent instanceof JSONArray) {
					JSONArray jsonArray = (JSONArray) jsonContent;
					for (Object content : jsonArray) {
						if (content != null)
							mapFieldTarget(context, fieldTarget,
									content.toString(), target, null);
					}
				} else
					mapFieldTarget(context, fieldTarget,
							jsonContent.toString(), target, null);
			} catch (PathNotFoundException e) {
				continue;
			} catch (IllegalArgumentException e) {
				Logging.warn(e);
				continue;
			}
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy