
// com.jaeksoft.searchlib.crawler.common.database.CommonFieldTarget Maven / Gradle / Ivy
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2010-2012 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler.common.database;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.XmlWriter;
import com.jaeksoft.searchlib.util.map.TargetField;
/**
 * A {@link TargetField} that carries the "filter" settings of a mapping
 * target: the boolean flags <code>removeTag</code>,
 * <code>convertHtmlEntities</code>, <code>filePath</code>,
 * <code>crawlFile</code> and <code>crawlUrl</code>, an optional file path
 * prefix, and an optional find/replace regular expression.
 * <p>
 * The settings can be read from the <code>filter</code> child elements of a
 * DOM node and written back with {@link #writeXml(XmlWriter)}.
 * <p>
 * NOTE: {@link #applyRegexPattern(String)} reuses one {@link Matcher} held by
 * the instance; a {@link Matcher} is not safe for concurrent use, so an
 * instance should not be shared between threads without external
 * synchronization.
 */
public class CommonFieldTarget extends TargetField {

    private boolean removeTag;

    private boolean convertHtmlEntities;

    private boolean filePath;

    private boolean crawlUrl;

    private boolean crawlFile;

    private String filePathPrefix;

    private String findRegexpTag;

    private String replaceRegexpTag;

    /**
     * Matcher built from {@link #findRegexpTag}; <code>null</code> when no
     * (non blank) find expression is set.
     */
    private Matcher findRegexpMatcher;

    /**
     * Builds a target from explicit values.
     * 
     * @param targetName
     *            the name passed to {@link TargetField}
     * @param removeTag
     *            the removeTag flag
     * @param convertHtmlEntities
     *            the convertHtmlEntities flag
     * @param filePath
     *            the filePath flag
     * @param filePathPrefix
     *            the prefix used by {@link #getFilePath(String)}, may be null
     * @param crawlFile
     *            the crawlFile flag
     * @param crawlUrl
     *            the crawlUrl flag
     * @param findRegexTag
     *            the find regular expression; null or blank means none
     * @param replaceRegexTag
     *            the replacement string; null is stored as an empty string
     * @throws java.util.regex.PatternSyntaxException
     *             if the find expression cannot be compiled
     */
    public CommonFieldTarget(String targetName, boolean removeTag,
            boolean convertHtmlEntities, boolean filePath,
            String filePathPrefix, boolean crawlFile, boolean crawlUrl,
            String findRegexTag, String replaceRegexTag) {
        super(targetName);
        this.removeTag = removeTag;
        this.convertHtmlEntities = convertHtmlEntities;
        this.filePathPrefix = filePathPrefix;
        this.filePath = filePath;
        this.crawlFile = crawlFile;
        this.crawlUrl = crawlUrl;
        this.findRegexpTag = findRegexTag;
        this.replaceRegexpTag = replaceRegexTag;
        checkRegexpPattern();
    }

    /**
     * Copy constructor.
     * 
     * @param from
     *            the instance whose name and settings are copied
     */
    public CommonFieldTarget(CommonFieldTarget from) {
        super(from.getName());
        this.copy(from);
    }

    /**
     * Copies the name and every setting of another instance into this one,
     * then rebuilds the matcher.
     * 
     * @param from
     *            the instance to copy from
     */
    public void copy(CommonFieldTarget from) {
        this.setName(from.getName());
        this.removeTag = from.removeTag;
        this.convertHtmlEntities = from.convertHtmlEntities;
        this.filePathPrefix = from.filePathPrefix;
        this.filePath = from.filePath;
        this.crawlFile = from.crawlFile;
        this.crawlUrl = from.crawlUrl;
        this.findRegexpTag = from.findRegexpTag;
        this.replaceRegexpTag = from.replaceRegexpTag;
        checkRegexpPattern();
    }

    /**
     * Builds a target from the <code>filter</code> child elements of a DOM
     * node. A flag becomes true as soon as one filter element has the
     * attribute set to "yes" (case insensitive); it is never reset to false
     * by a later element. <code>filePathPrefix</code> takes the value read
     * from the last filter element.
     * 
     * @param targetName
     *            the name passed to {@link TargetField}
     * @param targetNode
     *            the node whose <code>filter</code> children are read
     * @throws java.util.regex.PatternSyntaxException
     *             if the find expression cannot be compiled
     */
    public CommonFieldTarget(String targetName, Node targetNode) {
        super(targetName);
        removeTag = false;
        convertHtmlEntities = false;
        List<Node> nodeList = DomUtils.getNodes(targetNode, "filter");
        for (Node node : nodeList) {
            if (isYes(node, "removeTag")) {
                removeTag = true;
            }
            if (isYes(node, "convertHtmlEntities")) {
                convertHtmlEntities = true;
            }
            if (isYes(node, "filePath")) {
                filePath = true;
            }
            filePathPrefix = DomUtils.getAttributeText(node, "filePathPrefix");
            if (isYes(node, "crawlFile")) {
                crawlFile = true;
            }
            if (isYes(node, "crawlUrl")) {
                crawlUrl = true;
            }
            List<Node> nl = DomUtils.getNodes(node, "findRegexpTag");
            if (nl.size() > 0) {
                findRegexpTag = StringEscapeUtils.unescapeXml(nl.get(0)
                        .getTextContent());
            }
            nl = DomUtils.getNodes(node, "replaceRegexpTag");
            if (nl.size() > 0) {
                replaceRegexpTag = StringEscapeUtils.unescapeXml(nl.get(0)
                        .getTextContent());
            }
        }
        // Done once after the loop so the regexp state is normalised even
        // when the node has no filter element at all.
        checkRegexpPattern();
    }

    /**
     * Tells whether an attribute of the node holds "yes" (case insensitive).
     */
    private static boolean isYes(Node node, String attributeName) {
        return "yes".equalsIgnoreCase(DomUtils.getAttributeText(node,
                attributeName));
    }

    /**
     * Converts a flag to the "yes" / "no" text used in the XML form.
     */
    private static String yesNo(boolean flag) {
        return flag ? "yes" : "no";
    }

    /**
     * Writes the settings as a <code>filter</code> element. The
     * <code>findRegexpTag</code> child is written when the find expression
     * is not null, the <code>replaceRegexpTag</code> child when the
     * replacement is not null and not empty.
     * 
     * @param xmlWriter
     *            the writer receiving the element
     * @throws SAXException
     *             propagated from the writer
     */
    public void writeXml(XmlWriter xmlWriter) throws SAXException {
        xmlWriter.startElement("filter", "removeTag", yesNo(removeTag),
                "convertHtmlEntities", yesNo(convertHtmlEntities),
                "filePath", yesNo(filePath), "filePathPrefix",
                filePathPrefix, "crawlFile", yesNo(crawlFile), "crawlUrl",
                yesNo(crawlUrl));
        if (findRegexpTag != null) {
            xmlWriter.startElement("findRegexpTag");
            xmlWriter.textNode(StringEscapeUtils.escapeXml(findRegexpTag));
            xmlWriter.endElement();
        }
        if (replaceRegexpTag != null && replaceRegexpTag.length() > 0) {
            xmlWriter.startElement("replaceRegexpTag");
            xmlWriter.textNode(StringEscapeUtils.escapeXml(replaceRegexpTag));
            xmlWriter.endElement();
        }
        xmlWriter.endElement();
    }

    /**
     * @param removeTag
     *            the removeTag to set
     */
    public void setRemoveTag(boolean removeTag) {
        this.removeTag = removeTag;
    }

    /**
     * @return the removeTag
     */
    public boolean isRemoveTag() {
        return removeTag;
    }

    /**
     * @param convertHtmlEntities
     *            the convertHtmlEntities to set
     */
    public void setConvertHtmlEntities(boolean convertHtmlEntities) {
        this.convertHtmlEntities = convertHtmlEntities;
    }

    /**
     * @return the convertHtmlEntities
     */
    public boolean isConvertHtmlEntities() {
        return convertHtmlEntities;
    }

    /**
     * @return the filePathPrefix
     */
    public String getFilePathPrefix() {
        return filePathPrefix;
    }

    /**
     * Builds a path by appending the content to the file path prefix.
     * 
     * @param content
     *            the text appended after the prefix
     * @return the prefix (when not null) followed by the content
     */
    public String getFilePath(String content) {
        StringBuilder path = new StringBuilder();
        if (filePathPrefix != null) {
            path.append(filePathPrefix);
        }
        path.append(content);
        return path.toString();
    }

    /**
     * @param filePathPrefix
     *            the filePathPrefix to set
     */
    public void setFilePathPrefix(String filePathPrefix) {
        this.filePathPrefix = filePathPrefix;
    }

    /**
     * @return the findRegexpTag
     */
    public String getFindRegexpTag() {
        return findRegexpTag;
    }

    /**
     * Sets the find expression and rebuilds the matcher.
     * 
     * @param findRegexpTag
     *            the findRegexpTag to set; null or blank means none
     * @throws java.util.regex.PatternSyntaxException
     *             if the expression cannot be compiled
     */
    public void setFindRegexpTag(String findRegexpTag) {
        this.findRegexpTag = findRegexpTag;
        checkRegexpPattern();
    }

    /**
     * @return true if a find expression is set and compiled
     */
    public final boolean hasRegexpPattern() {
        return (findRegexpMatcher != null);
    }

    /**
     * Normalises the regexp state: a blank find expression becomes null, the
     * matcher is rebuilt from the find expression (or cleared), and a null
     * replacement becomes an empty string.
     */
    private void checkRegexpPattern() {
        if (findRegexpTag != null && findRegexpTag.trim().length() == 0) {
            findRegexpTag = null;
        }
        findRegexpMatcher = findRegexpTag == null ? null : Pattern.compile(
                findRegexpTag).matcher("");
        if (replaceRegexpTag == null) {
            replaceRegexpTag = StringUtils.EMPTY;
        }
    }

    /**
     * Replaces every match of the find expression in the text by the
     * replacement string.
     * 
     * @param text
     *            the text to process
     * @return the processed text; the text itself, unchanged, when no find
     *         expression is set or when the text is null
     */
    public final String applyRegexPattern(String text) {
        // Without a matcher (see hasRegexpPattern) there is nothing to
        // apply; returning the input avoids a NullPointerException.
        if (findRegexpMatcher == null || text == null) {
            return text;
        }
        return findRegexpMatcher.reset(text).replaceAll(replaceRegexpTag);
    }

    /**
     * @return the replaceRegexpTag
     */
    public String getReplaceRegexpTag() {
        return replaceRegexpTag;
    }

    /**
     * @param replaceRegexpTag
     *            the replaceRegexpTag to set; null is stored as an empty
     *            string, as done by the constructors
     */
    public void setReplaceRegexpTag(String replaceRegexpTag) {
        this.replaceRegexpTag = replaceRegexpTag == null ? StringUtils.EMPTY
                : replaceRegexpTag;
    }

    /**
     * Sets the filePath flag. Disabling the flag also clears the file path
     * prefix.
     * 
     * @param filePath
     *            the FilePath to set
     */
    public void setFilePath(boolean filePath) {
        this.filePath = filePath;
        if (!filePath) {
            setFilePathPrefix(null);
        }
    }

    /**
     * @return the FilePath
     */
    public boolean isFilePath() {
        return filePath;
    }

    /**
     * @return true if the FilePath is not set
     */
    public boolean isNotFilePath() {
        return !isFilePath();
    }

    /**
     * @return true if neither the filePath flag nor the crawlFile flag is
     *         set
     */
    public boolean isNotFilePathPrefixEditable() {
        return !(filePath || crawlFile);
    }

    /**
     * @param crawlUrl
     *            the crawlUrl to set
     */
    public void setCrawlUrl(boolean crawlUrl) {
        this.crawlUrl = crawlUrl;
    }

    /**
     * @return the crawlUrl
     */
    public boolean isCrawlUrl() {
        return crawlUrl;
    }

    /**
     * @return true if the crawlUrl flag is not set
     */
    public boolean isNotCrawlUrl() {
        return !crawlUrl;
    }

    /**
     * @param crawlFile
     *            the crawlFile to set
     */
    public void setCrawlFile(boolean crawlFile) {
        this.crawlFile = crawlFile;
    }

    /**
     * @return the crawlFile
     */
    public boolean isCrawlFile() {
        return crawlFile;
    }

    /**
     * @return true if the crawlFile flag is not set
     */
    public boolean isNotCrawlFile() {
        return !crawlFile;
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy