
// com.jaeksoft.searchlib.crawler.FieldMapGeneric Maven / Gradle / Ivy
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2010-2014 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.List;
import java.util.Set;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.xpath.XPathExpressionException;
import net.minidev.json.JSONArray;
import org.apache.commons.lang.StringEscapeUtils;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.crawler.common.database.CommonFieldTarget;
import com.jaeksoft.searchlib.crawler.file.database.FilePathItem;
import com.jaeksoft.searchlib.crawler.file.process.FileInstanceAbstract;
import com.jaeksoft.searchlib.crawler.web.database.HostUrlList.ListType;
import com.jaeksoft.searchlib.crawler.web.process.WebCrawlThread;
import com.jaeksoft.searchlib.crawler.web.spider.Crawl;
import com.jaeksoft.searchlib.function.expression.SyntaxError;
import com.jaeksoft.searchlib.index.FieldContent;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.parser.Parser;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.schema.FieldValueItem;
import com.jaeksoft.searchlib.schema.FieldValueOriginEnum;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.LinkUtils;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.XPathParser;
import com.jaeksoft.searchlib.util.XmlWriter;
import com.jaeksoft.searchlib.util.map.GenericLink;
import com.jaeksoft.searchlib.util.map.GenericMap;
import com.jaeksoft.searchlib.util.map.SourceField;
import com.jaeksoft.searchlib.util.map.TargetField;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.PathNotFoundException;
/**
 * Base class for field maps: an ordered collection of links, each one
 * associating a source field with a target field. The map can be loaded from
 * and stored to XML ({@code <link source="..." target="..."/>} elements), and
 * provides the shared logic that pushes a source value into an
 * {@link IndexDocument} according to the options of a
 * {@link CommonFieldTarget}.
 *
 * @param <S>
 *            the source side of a link
 * @param <T>
 *            the target side of a link
 */
public abstract class FieldMapGeneric<S extends SourceField, T extends TargetField>
        extends GenericMap<S, T> {

    /**
     * The XML file backing this map, or {@code null} when the map was built
     * without a file (in which case {@link #store()} cannot be used).
     */
    private final File mapFile;

    /**
     * Creates an empty map which is not backed by any file.
     */
    protected FieldMapGeneric() {
        mapFile = null;
    }

    /**
     * Creates a map which is not backed by any file and fills it with the
     * {@code link} elements found under the given node.
     *
     * @param parentNode
     *            the node holding the {@code link} elements, may be
     *            {@code null} (the map is then left empty)
     * @throws XPathExpressionException
     *             if the lookup of the {@code link} nodes fails
     */
    protected FieldMapGeneric(Node parentNode) throws XPathExpressionException {
        mapFile = null;
        // NOTE(review): load() calls the abstract loadSource/loadTarget from
        // the constructor; subclasses must not rely on their own fields
        // being initialized there.
        load(parentNode);
    }

    /**
     * Creates a map backed by the given file. When the file exists, it is
     * parsed and the links found under {@code rootXPath} are loaded; when it
     * does not exist, the map is left empty.
     *
     * @param mapFile
     *            the XML file backing the map
     * @param rootXPath
     *            the XPath of the node holding the {@code link} elements
     * @throws ParserConfigurationException
     *             if the XML parser cannot be set up
     * @throws SAXException
     *             if the file is not valid XML
     * @throws IOException
     *             if the file cannot be read
     * @throws XPathExpressionException
     *             if an XPath evaluation fails
     */
    protected FieldMapGeneric(File mapFile, String rootXPath)
            throws ParserConfigurationException, SAXException, IOException,
            XPathExpressionException {
        this.mapFile = mapFile;
        if (!mapFile.exists()) {
            return;
        }
        XPathParser xpp = new XPathParser(mapFile);
        load(xpp.getNode(rootXPath));
    }

    /**
     * Builds the target of a link from its XML representation.
     *
     * @param targetName
     *            the value of the {@code target} attribute
     * @param node
     *            the {@code link} node being loaded
     * @return the target, or {@code null} to skip this link
     */
    protected abstract T loadTarget(String targetName, Node node);

    /**
     * Builds the source of a link from its XML representation.
     *
     * @param source
     *            the XML-unescaped value of the {@code source} attribute
     * @return the source, or {@code null} to skip this link
     */
    protected abstract S loadSource(String source);

    /**
     * Adds to this map every {@code link} child of the given node. A link is
     * skipped when {@link #loadSource(String)} or
     * {@link #loadTarget(String, Node)} returns {@code null}.
     *
     * @param parentNode
     *            the node holding the {@code link} elements; nothing is done
     *            when it is {@code null}
     * @throws XPathExpressionException
     *             if the lookup of the {@code link} nodes fails
     */
    public void load(Node parentNode) throws XPathExpressionException {
        synchronized (this) {
            if (parentNode == null) {
                return;
            }
            List<Node> nodeList = DomUtils.getNodes(parentNode, "link");
            for (Node node : nodeList) {
                String sourceName = StringEscapeUtils.unescapeXml(DomUtils
                        .getAttributeText(node, "source"));
                S source = loadSource(sourceName);
                if (source == null) {
                    continue;
                }
                String targetName = DomUtils.getAttributeText(node, "target");
                T target = loadTarget(targetName, node);
                if (target == null) {
                    continue;
                }
                add(source, target);
            }
        }
    }

    /**
     * Writes the target-specific content nested inside a {@code link}
     * element.
     *
     * @param xmlWriter
     *            the writer, positioned inside the opened {@code link}
     *            element
     * @param target
     *            the target of the link being written
     * @throws SAXException
     *             if the XML cannot be written
     */
    protected abstract void writeTarget(XmlWriter xmlWriter, T target)
            throws SAXException;

    /**
     * Writes one {@code link} element per link of this map, carrying the
     * {@code source}, {@code target}, {@code analyzer} and {@code boost}
     * attributes followed by the content produced by
     * {@link #writeTarget(XmlWriter, TargetField)}.
     *
     * @param xmlWriter
     *            the writer receiving the elements
     * @throws SAXException
     *             if the XML cannot be written
     */
    public void store(XmlWriter xmlWriter) throws SAXException {
        for (GenericLink<S, T> link : getList()) {
            T target = link.getTarget();
            String boost = target.getBoost() == null ? null : Float
                    .toString(target.getBoost());
            xmlWriter.startElement("link", "source", link.getSource()
                    .toXmlAttribute(), "target", target.toXmlAttribute(),
                    "analyzer", target.getAnalyzer(), "boost", boost);
            writeTarget(xmlWriter, target);
            xmlWriter.endElement();
        }
    }

    /**
     * Writes this map to its backing file as a {@code map} root element
     * containing the links. The file is created when it does not exist and is
     * written in UTF-8, the encoding announced to the {@link XmlWriter}.
     *
     * @throws TransformerConfigurationException
     *             if the XML writer cannot be set up
     * @throws SAXException
     *             if the XML cannot be written
     * @throws IOException
     *             if this map has no backing file, or if the file cannot be
     *             created or opened
     */
    public void store() throws TransformerConfigurationException, SAXException,
            IOException {
        synchronized (this) {
            if (mapFile == null) {
                throw new IOException(
                        "This field map is not backed by a file");
            }
            if (!mapFile.exists()) {
                mapFile.createNewFile();
            }
            // The charset is given explicitly: the default PrintWriter(File)
            // uses the platform charset, which would not match the "UTF-8"
            // encoding passed to the XmlWriter below.
            PrintWriter pw = new PrintWriter(mapFile, "UTF-8");
            try {
                XmlWriter xmlWriter = new XmlWriter(pw, "UTF-8");
                xmlWriter.startElement("map");
                store(xmlWriter);
                xmlWriter.endElement();
                xmlWriter.endDocument();
            } finally {
                pw.close();
            }
        }
    }

    /**
     * Maps every value of a field content to the target document, using
     * {@link #mapFieldTarget(FieldMapContext, CommonFieldTarget, String, IndexDocument, Set)}
     * for each value.
     *
     * @param context
     *            the mapping context
     * @param fc
     *            the values to map; nothing is done when it is {@code null}
     * @param targetField
     *            the target definition
     * @param target
     *            the document receiving the values
     * @param filePathSet
     *            the paths already handled, may be {@code null}
     */
    protected final void mapFieldTarget(FieldMapContext context,
            FieldContent fc, CommonFieldTarget targetField,
            IndexDocument target, Set<String> filePathSet) throws IOException,
            SearchLibException, ParseException, SyntaxError,
            URISyntaxException, ClassNotFoundException, InterruptedException,
            InstantiationException, IllegalAccessException {
        if (fc == null) {
            return;
        }
        for (FieldValueItem fvi : fc.getValues()) {
            mapFieldTarget(context, targetField, fvi.value, target, filePathSet);
        }
    }

    /**
     * Applies the text transformations enabled on the target, in this order:
     * HTML entity conversion, tag removal, regular expression.
     *
     * @param dfTarget
     *            the target definition holding the transformation options
     * @param content
     *            the text to transform
     * @return the transformed text, or {@code null} when {@code content} is
     *         empty
     */
    public final String mapFieldTarget(CommonFieldTarget dfTarget,
            String content) {
        if (StringUtils.isEmpty(content)) {
            return null;
        }
        if (dfTarget.isConvertHtmlEntities()) {
            content = StringEscapeUtils.unescapeHtml(content);
        }
        if (dfTarget.isRemoveTag()) {
            content = StringUtils.removeTag(content);
        }
        if (dfTarget.hasRegexpPattern()) {
            content = dfTarget.applyRegexPattern(content);
        }
        return content;
    }

    /**
     * Maps a single value to the target document. Depending on the options of
     * the target, the value is also used to parse a local file, to parse a
     * file reached through a {@link FilePathItem}, and/or to crawl an URL;
     * the documents produced that way are merged into {@code target}. The
     * value itself, after the transformations of
     * {@link #mapFieldTarget(CommonFieldTarget, String)}, is finally added to
     * the field named by the target.
     *
     * @param context
     *            the mapping context providing the parser selector, the file
     *            path manager, the web crawl master and the language
     * @param dfTarget
     *            the target definition; nothing is done when it is
     *            {@code null}
     * @param content
     *            the value to map; nothing is done when it is empty
     * @param target
     *            the document receiving the values
     * @param filePathSet
     *            the paths already handled, used to avoid parsing the same
     *            file twice; when {@code null} no de-duplication is done
     * @throws SearchLibException
     *             if no {@link FilePathItem} matches the file path prefix of
     *             the target
     */
    protected final void mapFieldTarget(FieldMapContext context,
            CommonFieldTarget dfTarget, String content, IndexDocument target,
            Set<String> filePathSet) throws SearchLibException, IOException,
            ParseException, SyntaxError, URISyntaxException,
            ClassNotFoundException, InterruptedException,
            InstantiationException, IllegalAccessException {
        if (dfTarget == null) {
            return;
        }
        if (StringUtils.isEmpty(content)) {
            return;
        }
        if (dfTarget.isFilePath()) {
            String filePath = dfTarget.getFilePath(content);
            if (isFirstVisit(filePathSet, filePath)) {
                File file = new File(filePath);
                if (file.exists()) {
                    Parser parser = context.parserSelector.parseFile(null,
                            file.getName(), null, null, file, context.lang);
                    if (parser != null) {
                        parser.popupateResult(0, target);
                    }
                } else {
                    Logging.error("File don't exist:" + file.getAbsolutePath());
                }
            }
        }
        if (dfTarget.isCrawlFile()) {
            String filePathName = dfTarget.getFilePathPrefix();
            // The de-duplication key is the raw content here, not the
            // resolved path.
            if (isFirstVisit(filePathSet, content)) {
                URI filePathURI = new URI(filePathName);
                FilePathItem filePathItem = context.filePathManager.findFirst(
                        filePathURI.getScheme(), filePathURI.getHost());
                if (filePathItem == null) {
                    throw new SearchLibException("FilePathItem not found: "
                            + filePathName);
                }
                FileInstanceAbstract fileInstance = FileInstanceAbstract
                        .create(filePathItem, null, filePathItem.getPath()
                                + content);
                // NOTE(review): the input stream is not closed here;
                // confirm that parseStream takes ownership of it.
                Parser parser = context.parserSelector
                        .parseStream(null, fileInstance.getFileName(), null,
                                null, fileInstance.getInputStream(),
                                context.lang, null, null);
                if (parser != null) {
                    parser.popupateResult(0, target);
                }
            }
        }
        if (dfTarget.isCrawlUrl()) {
            WebCrawlThread crawlThread = context.webCrawlMaster.manualCrawl(
                    LinkUtils.newEncodedURL(content), ListType.DBCRAWL);
            // NOTE(review): 60 is presumably a timeout in seconds; confirm.
            crawlThread.waitForStart(60);
            crawlThread.waitForEnd(60);
            Crawl crawl = crawlThread.getCurrentCrawl();
            if (crawl != null) {
                IndexDocument targetIndexDocument = crawl
                        .getTargetIndexDocument(0);
                if (targetIndexDocument != null) {
                    target.add(targetIndexDocument);
                }
            }
        }
        content = mapFieldTarget(dfTarget, content);
        target.add(dfTarget.getName(), new FieldValueItem(
                FieldValueOriginEnum.EXTERNAL, content));
    }

    /**
     * Records a key in the set of handled paths and tells whether it still
     * has to be processed.
     *
     * @param filePathSet
     *            the keys already handled, may be {@code null}
     * @param key
     *            the key to record
     * @return {@code true} when there is no set (no de-duplication) or when
     *         the key was not in the set yet
     */
    private static boolean isFirstVisit(Set<String> filePathSet, String key) {
        // Set.add returns false when the key is already present, which
        // gives the contains-then-add behavior without failing on a null set.
        return filePathSet == null || filePathSet.add(key);
    }

    /**
     * Maps a JSON structure to the target document. For each link, the
     * unique name of the source is evaluated as a JsonPath expression against
     * {@code jsonObject}; a {@link JSONArray} result maps each non-null
     * element, any other non-null result is mapped as a single value. Values
     * are converted with {@code toString()}. A path which is not found is
     * silently skipped; an {@link IllegalArgumentException} raised while
     * handling a link is logged as a warning and the link is skipped.
     *
     * @param context
     *            the mapping context
     * @param jsonObject
     *            the JSON structure to read
     * @param target
     *            the document receiving the values
     */
    public void mapJson(FieldMapContext context, Object jsonObject,
            IndexDocument target) throws SearchLibException, IOException,
            ParseException, SyntaxError, URISyntaxException,
            ClassNotFoundException, InterruptedException,
            InstantiationException, IllegalAccessException {
        for (GenericLink<S, T> link : getList()) {
            String jsonPath = link.getSource().getUniqueName();
            try {
                Object jsonContent = JsonPath.read(jsonObject, jsonPath);
                if (jsonContent == null) {
                    continue;
                }
                CommonFieldTarget fieldTarget = (CommonFieldTarget) link
                        .getTarget();
                if (jsonContent instanceof JSONArray) {
                    JSONArray jsonArray = (JSONArray) jsonContent;
                    for (Object content : jsonArray) {
                        if (content != null) {
                            mapFieldTarget(context, fieldTarget,
                                    content.toString(), target, null);
                        }
                    }
                } else {
                    mapFieldTarget(context, fieldTarget,
                            jsonContent.toString(), target, null);
                }
            } catch (PathNotFoundException e) {
                // The document has no value for this path: nothing to map.
                continue;
            } catch (IllegalArgumentException e) {
                Logging.warn(e);
                continue;
            }
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy