
// com.jaeksoft.searchlib.parser.ParserFactory Maven / Gradle / Ivy
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2008-2014 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.parser;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
import javax.xml.xpath.XPathExpressionException;
import org.w3c.dom.DOMException;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.ClassFactory;
import com.jaeksoft.searchlib.analysis.ClassProperty;
import com.jaeksoft.searchlib.analysis.ClassPropertyEnum;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.web.database.UrlFilterItem;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.XPathParser;
import com.jaeksoft.searchlib.util.XmlWriter;
/**
 * Describes a configured parser: its name and fail-over parser name (stored as
 * class properties), the MIME types, file extensions and URL wildcard patterns
 * it is registered for, its field map and an optional URL filter list.
 * <p>
 * Instances are ordered by parser name, then by class name (see
 * {@link #compareTo(ParserFactory)}).
 * <p>
 * The add/remove methods synchronize on this instance; the getters and
 * {@link #matchUrlPattern(String)} read the collections without locking.
 */
public class ParserFactory extends ClassFactory implements
        Comparable<ParserFactory> {

    /** Package passed to {@link ClassFactory} when instantiating parsers. */
    private static final String PARSER_PACKAGE = "com.jaeksoft.searchlib.parser";

    /** MIME types handled by this parser; {@code null} until one is added. */
    private Set<String> mimeTypeList;

    /**
     * URL wildcard expressions (trimmed, as entered) mapped to their compiled
     * patterns; {@code null} until one is added.
     */
    private Map<String, Pattern> urlPatternList;

    /** File extensions handled by this parser; {@code null} until one is added. */
    private Set<String> extensionList;

    private ParserFieldMap fieldMap;

    private UrlFilterItem[] urlFilterList;

    private ParserFieldEnum[] fieldList;

    /** Lazily resolved by {@link #getParserType()}. */
    private ParserType parserType;

    // NOTE(review): not read anywhere in this class; presumably consulted by
    // subclasses or callers -- confirm.
    protected final boolean externalAllowed;

    /**
     * @param fieldList
     *            the fields exposed by this parser
     * @param externalAllowed
     *            value stored in {@link #externalAllowed}
     */
    protected ParserFactory(ParserFieldEnum[] fieldList, boolean externalAllowed) {
        this.externalAllowed = externalAllowed;
        this.fieldList = fieldList;
        this.parserType = null;
        this.fieldMap = null;
        this.urlFilterList = null;
        this.mimeTypeList = null;
        this.extensionList = null;
    }

    /**
     * Registers the parser name and fail-over parser name properties, both
     * with an empty default value.
     */
    @Override
    protected void initProperties() throws SearchLibException {
        addProperty(ClassPropertyEnum.PARSER_NAME, "", null, 20, 1);
        addProperty(ClassPropertyEnum.PARSER_FAIL_OVER_NAME, "", null, 20, 1);
    }

    /**
     * @return the fields passed to the constructor
     */
    public ParserFieldEnum[] getFieldList() {
        return fieldList;
    }

    /**
     * @return the value of the parser name property
     */
    public String getParserName() {
        return getProperty(ClassPropertyEnum.PARSER_NAME).getValue();
    }

    /**
     * @return the value of the fail-over parser name property
     */
    public String getFailOverParserName() {
        return getProperty(ClassPropertyEnum.PARSER_FAIL_OVER_NAME).getValue();
    }

    /**
     * Returns the parser type of this class, looking it up on first use and
     * caching the result.
     *
     * @return the parser type, or {@code null} when no type is cached yet and
     *         this factory has no config
     */
    public ParserType getParserType() {
        if (parserType != null)
            return parserType;
        if (config == null)
            return null;
        parserType = ParserTypeEnum.INSTANCE.find(this.getClass());
        return parserType;
    }

    /**
     * @param parserName
     *            the new value of the parser name property
     * @throws SearchLibException
     */
    public void setParserName(String parserName) throws SearchLibException {
        getProperty(ClassPropertyEnum.PARSER_NAME).setValue(parserName);
    }

    /**
     * @param parserName
     *            the new value of the fail-over parser name property
     * @throws SearchLibException
     */
    public void setFailOverParserName(String parserName)
            throws SearchLibException {
        getProperty(ClassPropertyEnum.PARSER_FAIL_OVER_NAME).setValue(
                parserName);
    }

    /**
     * @return the size limit property parsed as an int, or 0 when the property
     *         is not defined
     * @throws NumberFormatException
     *             if the property value is not a parsable integer
     */
    public int getSizeLimit() {
        ClassProperty prop = getProperty(ClassPropertyEnum.SIZE_LIMIT);
        if (prop == null)
            return 0;
        return Integer.parseInt(prop.getValue());
    }

    /**
     * @return the field map, created empty on first access
     */
    public ParserFieldMap getFieldMap() {
        if (fieldMap == null)
            fieldMap = new ParserFieldMap();
        return fieldMap;
    }

    /**
     * Adds a file extension. A {@code null} extension is ignored, consistent
     * with {@link #addUrlPattern(String)} (the sorted set cannot hold
     * {@code null}).
     *
     * @param extension
     *            the extension to add
     */
    public void addExtension(String extension) {
        synchronized (this) {
            if (extension == null)
                return;
            if (extensionList == null)
                extensionList = new TreeSet<String>();
            extensionList.add(extension);
        }
    }

    /**
     * Removes a file extension. A {@code null} extension is ignored.
     *
     * @param extension
     *            the extension to remove
     */
    public void removeExtension(String extension) {
        synchronized (this) {
            if (extension == null)
                return;
            if (extensionList != null)
                extensionList.remove(extension);
        }
    }

    /**
     * Adds a MIME type. A {@code null} MIME type is ignored, consistent with
     * {@link #addUrlPattern(String)} (the sorted set cannot hold {@code null}).
     *
     * @param mimeType
     *            the MIME type to add
     */
    public void addMimeType(String mimeType) {
        synchronized (this) {
            if (mimeType == null)
                return;
            if (mimeTypeList == null)
                mimeTypeList = new TreeSet<String>();
            mimeTypeList.add(mimeType);
        }
    }

    /**
     * Removes a MIME type. A {@code null} MIME type is ignored.
     *
     * @param mimeType
     *            the MIME type to remove
     */
    public void removeMimeType(String mimeType) {
        synchronized (this) {
            if (mimeType == null)
                return;
            if (mimeTypeList != null)
                mimeTypeList.remove(mimeType);
        }
    }

    /**
     * Adds a URL wildcard pattern. The expression is trimmed, compiled with
     * {@link StringUtils#wildcardPattern(String)} and stored under its trimmed
     * text. A {@code null} pattern is ignored.
     *
     * @param urlPattern
     *            the wildcard expression to add
     */
    public void addUrlPattern(String urlPattern) {
        synchronized (this) {
            if (urlPattern == null)
                return;
            urlPattern = urlPattern.trim();
            Pattern pattern = StringUtils.wildcardPattern(urlPattern);
            if (urlPatternList == null)
                urlPatternList = new TreeMap<String, Pattern>();
            urlPatternList.put(urlPattern, pattern);
        }
    }

    /**
     * Removes a URL wildcard pattern, looked up by its trimmed text. A
     * {@code null} pattern is ignored.
     *
     * @param urlPattern
     *            the wildcard expression to remove
     */
    public void removeUrlPattern(String urlPattern) {
        synchronized (this) {
            if (urlPattern == null)
                return;
            urlPattern = urlPattern.trim();
            if (urlPatternList != null)
                urlPatternList.remove(urlPattern);
        }
    }

    /**
     * Create a new ParserFactory by reading the attributes of an XML node.
     * The "map" child node provides the field map; every "contentType",
     * "urlPattern" and "extension" child node is registered on the new
     * factory.
     *
     * @param config
     *            the configuration passed to {@link ClassFactory}
     * @param xpp
     *            the XPath helper used to read the child nodes
     * @param parserNode
     *            the XML node describing the parser
     * @return a ParserFactory
     * @throws SearchLibException
     * @throws XPathExpressionException
     * @throws ClassNotFoundException
     * @throws DOMException
     */
    public static ParserFactory create(Config config, XPathParser xpp,
            Node parserNode) throws SearchLibException,
            XPathExpressionException, DOMException, ClassNotFoundException {
        ParserFactory parserFactory = (ParserFactory) ClassFactory.create(
                config, PARSER_PACKAGE, parserNode, "attributes");
        parserFactory.fieldMap = new ParserFieldMap(xpp.getNode(parserNode,
                "map"));

        NodeList mimeNodes = xpp.getNodeList(parserNode, "contentType");
        for (int j = 0; j < mimeNodes.getLength(); j++) {
            Node mimeNode = mimeNodes.item(j);
            String contentType = xpp.getNodeString(mimeNode, false);
            parserFactory.addMimeType(contentType);
        }

        NodeList urlPatternNodes = xpp.getNodeList(parserNode, "urlPattern");
        for (int j = 0; j < urlPatternNodes.getLength(); j++) {
            Node urlPatternNode = urlPatternNodes.item(j);
            String urlPattern = xpp.getNodeString(urlPatternNode, false);
            parserFactory.addUrlPattern(urlPattern);
        }

        NodeList extensionNodes = xpp.getNodeList(parserNode, "extension");
        for (int j = 0; j < extensionNodes.getLength(); j++) {
            Node extensionNode = extensionNodes.item(j);
            String extension = xpp.getNodeString(extensionNode, false);
            parserFactory.addExtension(extension);
        }
        return parserFactory;
    }

    /**
     * Create a new ParserFactory from a class name. The instance is created
     * without a config; the config is attached afterwards and the parser name
     * is then set.
     *
     * @param config
     *            the configuration to attach to the new factory
     * @param parserName
     *            the parser name to set
     * @param className
     *            the parser class to instantiate
     * @return a ParserFactory
     * @throws SearchLibException
     * @throws ClassNotFoundException
     */
    public static ParserFactory create(Config config, String parserName,
            String className) throws SearchLibException, ClassNotFoundException {
        ParserFactory parserFactory = (ParserFactory) ClassFactory.create(null,
                PARSER_PACKAGE, className);
        parserFactory.config = config;
        parserFactory.setParserName(parserName);
        return parserFactory;
    }

    /**
     * Clone a parser. The field map, extensions, MIME types and URL patterns
     * are copied into new collections; the URL filter list is taken from the
     * source parser's config when it has one.
     *
     * @param parser
     *            the parser to clone
     * @return a new ParserFactory
     * @throws SearchLibException
     * @throws ClassNotFoundException
     */
    public static ParserFactory create(ParserFactory parser)
            throws SearchLibException, ClassNotFoundException {
        ParserFactory newParser = (ParserFactory) ClassFactory.create(parser);
        newParser.fieldMap = new ParserFieldMap();
        if (parser.fieldMap != null)
            parser.fieldMap.copyTo(newParser.fieldMap);
        if (parser.config != null)
            newParser.setUrlFilterList(parser.config.getUrlFilterList()
                    .getArray());
        if (parser.extensionList != null)
            newParser.extensionList = new TreeSet<String>(parser.extensionList);
        if (parser.mimeTypeList != null)
            newParser.mimeTypeList = new TreeSet<String>(parser.mimeTypeList);
        if (parser.urlPatternList != null)
            newParser.urlPatternList = new TreeMap<String, Pattern>(
                    parser.urlPatternList);
        return newParser;
    }

    /**
     * @return the internal extension set (not a copy), or {@code null} if no
     *         extension was ever added
     */
    public Set<String> getExtensionSet() {
        return extensionList;
    }

    /**
     * @return the internal MIME type set (not a copy), or {@code null} if no
     *         MIME type was ever added
     */
    public Set<String> getMimeTypeSet() {
        return mimeTypeList;
    }

    /**
     * @return the key view of the URL wildcard expressions, or {@code null} if
     *         no pattern was ever added
     */
    public Set<String> getUrlPatternSet() {
        if (urlPatternList == null)
            return null;
        return urlPatternList.keySet();
    }

    /**
     * Tests a URL against the registered patterns.
     *
     * @param url
     *            the URL to test, may be {@code null}
     * @return for a {@code null} URL, true only when no pattern is registered;
     *         otherwise true when at least one pattern matches the whole URL
     */
    public boolean matchUrlPattern(String url) {
        if (url == null)
            return urlPatternList == null || urlPatternList.isEmpty();
        if (urlPatternList == null)
            return false;
        for (Pattern pattern : urlPatternList.values())
            if (pattern.matcher(url).matches())
                return true;
        return false;
    }

    /**
     * @param urlFilterList
     *            the urlFilterList to set
     */
    public void setUrlFilterList(UrlFilterItem[] urlFilterList) {
        this.urlFilterList = urlFilterList;
    }

    /**
     * @return the urlFilterList
     */
    public UrlFilterItem[] getUrlFilterList() {
        return urlFilterList;
    }

    /**
     * Orders parsers by parser name, then by class name.
     */
    @Override
    public int compareTo(ParserFactory parserFactory) {
        int c = getParserName().compareTo(parserFactory.getParserName());
        if (c != 0)
            return c;
        return getClassName().compareTo(parserFactory.getClassName());
    }

    /**
     * Writes this parser as a "parser" element containing the "attributes"
     * node, then one "contentType", "urlPattern" and "extension" element per
     * registered value, then the "map" element when a field map exists.
     *
     * @param xmlWriter
     *            the writer receiving the XML
     * @throws SAXException
     */
    public void writeXmlConfig(XmlWriter xmlWriter) throws SAXException {
        xmlWriter.startElement("parser", getXmlAttributes());
        writeXmlNodeAttributes(xmlWriter, "attributes");
        if (mimeTypeList != null) {
            for (String mimeType : mimeTypeList) {
                xmlWriter.startElement("contentType");
                xmlWriter.textNode(mimeType);
                xmlWriter.endElement();
            }
        }
        if (urlPatternList != null) {
            for (String urlPattern : urlPatternList.keySet()) {
                xmlWriter.startElement("urlPattern");
                xmlWriter.textNode(urlPattern);
                xmlWriter.endElement();
            }
        }
        if (extensionList != null) {
            for (String extension : extensionList) {
                xmlWriter.startElement("extension");
                xmlWriter.textNode(extension);
                xmlWriter.endElement();
            }
        }
        if (fieldMap != null) {
            xmlWriter.startElement("map");
            fieldMap.store(xmlWriter);
            xmlWriter.endElement();
        }
        xmlWriter.endElement();
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy