All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.jaeksoft.searchlib.crawler.common.database.CommonFieldTarget Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class, search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTFul API you will be able to integrate quickly and easily advanced full-text search capabilities in your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

The newest version!
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2010-2012 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.common.database;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringEscapeUtils;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.StringUtils;
import com.jaeksoft.searchlib.util.XmlWriter;
import com.jaeksoft.searchlib.util.map.TargetField;

/**
 * Target field of a crawler field mapping, carrying a set of optional
 * content filters: tag removal, HTML entity conversion, file path handling,
 * file/URL crawling flags and a find/replace regular expression.
 * <p>
 * The settings are persisted as a <code>filter</code> XML element (see
 * {@link #writeXml(XmlWriter)} and
 * {@link #CommonFieldTarget(String, Node)}).
 * <p>
 * Instances hold a single reusable {@link Matcher}, which is not safe for
 * concurrent use; callers sharing an instance between threads must
 * synchronize calls to {@link #applyRegexPattern(String)}.
 */
public class CommonFieldTarget extends TargetField {

	private boolean removeTag;

	private boolean convertHtmlEntities;

	private boolean filePath;

	private boolean crawlUrl;

	private boolean crawlFile;

	private String filePathPrefix;

	private String findRegexpTag;

	private String replaceRegexpTag;

	/** Compiled form of {@link #findRegexpTag}; null when no pattern is set. */
	private Matcher findRegexpMatcher;

	/**
	 * Builds a target from explicit settings.
	 * 
	 * @param targetName
	 *            the name of the target field
	 * @param removeTag
	 *            the removeTag flag
	 * @param convertHtmlEntities
	 *            the convertHtmlEntities flag
	 * @param filePath
	 *            the filePath flag
	 * @param filePathPrefix
	 *            the prefix prepended by {@link #getFilePath(String)}, may be
	 *            null
	 * @param crawlFile
	 *            the crawlFile flag
	 * @param crawlUrl
	 *            the crawlUrl flag
	 * @param findRegexTag
	 *            the regular expression to find, null or blank for none
	 * @param replaceRegexTag
	 *            the replacement string, null is treated as empty
	 * @throws java.util.regex.PatternSyntaxException
	 *             if findRegexTag is not a valid regular expression
	 */
	public CommonFieldTarget(String targetName, boolean removeTag, boolean convertHtmlEntities, boolean filePath,
			String filePathPrefix, boolean crawlFile, boolean crawlUrl, String findRegexTag, String replaceRegexTag) {
		super(targetName);
		this.removeTag = removeTag;
		this.convertHtmlEntities = convertHtmlEntities;
		this.filePathPrefix = filePathPrefix;
		this.filePath = filePath;
		this.crawlFile = crawlFile;
		this.crawlUrl = crawlUrl;
		this.findRegexpTag = findRegexTag;
		this.replaceRegexpTag = replaceRegexTag;
		checkRegexpPattern();
	}

	/**
	 * Copy constructor.
	 * 
	 * @param from
	 *            the instance to duplicate
	 */
	public CommonFieldTarget(CommonFieldTarget from) {
		super(from.getName());
		this.copy(from);
	}

	/**
	 * Overwrites the name and every setting of this instance with the values
	 * of another one, then rebuilds the regular expression matcher.
	 * 
	 * @param from
	 *            the instance to copy the values from
	 */
	public void copy(CommonFieldTarget from) {
		this.setName(from.getName());
		this.removeTag = from.removeTag;
		this.convertHtmlEntities = from.convertHtmlEntities;
		this.filePathPrefix = from.filePathPrefix;
		this.filePath = from.filePath;
		this.crawlFile = from.crawlFile;
		this.crawlUrl = from.crawlUrl;
		this.findRegexpTag = from.findRegexpTag;
		this.replaceRegexpTag = from.replaceRegexpTag;
		checkRegexpPattern();
	}

	/**
	 * Builds a target from its XML representation. Every <code>filter</code>
	 * child of the given node is read in document order; boolean flags are
	 * enabled by the attribute value "yes" (case insensitive) and are never
	 * reset by a later node, whereas filePathPrefix and the regular expression
	 * tags take the value of the last node providing them.
	 * 
	 * @param targetName
	 *            the name of the target field
	 * @param targetNode
	 *            the XML node holding the <code>filter</code> children
	 * @throws java.util.regex.PatternSyntaxException
	 *             if the stored find expression is not a valid regular
	 *             expression
	 */
	public CommonFieldTarget(String targetName, Node targetNode) {
		super(targetName);
		removeTag = false;
		convertHtmlEntities = false;
		List<Node> nodeList = DomUtils.getNodes(targetNode, "filter");
		for (Node node : nodeList) {
			if ("yes".equalsIgnoreCase(DomUtils.getAttributeText(node, "removeTag")))
				removeTag = true;
			if ("yes".equalsIgnoreCase(DomUtils.getAttributeText(node, "convertHtmlEntities")))
				convertHtmlEntities = true;
			if ("yes".equalsIgnoreCase(DomUtils.getAttributeText(node, "filePath")))
				filePath = true;
			filePathPrefix = DomUtils.getAttributeText(node, "filePathPrefix");
			if ("yes".equalsIgnoreCase(DomUtils.getAttributeText(node, "crawlFile")))
				crawlFile = true;
			if ("yes".equalsIgnoreCase(DomUtils.getAttributeText(node, "crawlUrl")))
				crawlUrl = true;
			List<Node> nl = DomUtils.getNodes(node, "findRegexpTag");
			if (!nl.isEmpty())
				findRegexpTag = StringEscapeUtils.unescapeXml(nl.get(0).getTextContent());
			nl = DomUtils.getNodes(node, "replaceRegexpTag");
			if (!nl.isEmpty())
				replaceRegexpTag = StringEscapeUtils.unescapeXml(nl.get(0).getTextContent());
		}
		// Done after the loop so that the regexp state is normalized even when
		// the node has no "filter" child at all.
		checkRegexpPattern();
	}

	/**
	 * Writes the settings as a <code>filter</code> element. The
	 * <code>findRegexpTag</code> child is written when a find expression is
	 * set, the <code>replaceRegexpTag</code> child only when the replacement
	 * is not empty.
	 * 
	 * @param xmlWriter
	 *            the writer receiving the element
	 * @throws SAXException
	 *             if the writer fails
	 */
	public void writeXml(XmlWriter xmlWriter) throws SAXException {
		xmlWriter.startElement("filter", "removeTag", removeTag ? "yes" : "no", "convertHtmlEntities",
				convertHtmlEntities ? "yes" : "no", "filePath", filePath ? "yes" : "no", "filePathPrefix",
				filePathPrefix, "crawlFile", crawlFile ? "yes" : "no", "crawlUrl", crawlUrl ? "yes" : "no");
		if (findRegexpTag != null) {
			xmlWriter.startElement("findRegexpTag");
			xmlWriter.textNode(StringEscapeUtils.escapeXml(findRegexpTag));
			xmlWriter.endElement();
		}
		if (replaceRegexpTag != null && replaceRegexpTag.length() > 0) {
			xmlWriter.startElement("replaceRegexpTag");
			xmlWriter.textNode(StringEscapeUtils.escapeXml(replaceRegexpTag));
			xmlWriter.endElement();
		}
		xmlWriter.endElement();
	}

	/**
	 * @param removeTag
	 *            the removeTag to set
	 */
	public void setRemoveTag(boolean removeTag) {
		this.removeTag = removeTag;
	}

	/**
	 * @return the removeTag
	 */
	public boolean isRemoveTag() {
		return removeTag;
	}

	/**
	 * @param convertHtmlEntities
	 *            the convertHtmlEntities to set
	 */
	public void setConvertHtmlEntities(boolean convertHtmlEntities) {
		this.convertHtmlEntities = convertHtmlEntities;
	}

	/**
	 * @return the convertHtmlEntities
	 */
	public boolean isConvertHtmlEntities() {
		return convertHtmlEntities;
	}

	/**
	 * @return the filePathPrefix
	 */
	public String getFilePathPrefix() {
		return filePathPrefix;
	}

	/**
	 * Builds a path by prepending the prefix, when one is set, to the given
	 * content.
	 * 
	 * @param content
	 *            the text appended after the prefix
	 * @return the concatenation of the prefix (if any) and the content
	 */
	public String getFilePath(String content) {
		StringBuilder path = new StringBuilder();
		if (filePathPrefix != null)
			path.append(filePathPrefix);
		path.append(content);
		return path.toString();
	}

	/**
	 * @param filePathPrefix
	 *            the filePathPrefix to set
	 */
	public void setFilePathPrefix(String filePathPrefix) {
		this.filePathPrefix = filePathPrefix;
	}

	/**
	 * @return the findRegexpTag
	 */
	public String getFindRegexpTag() {
		return findRegexpTag;
	}

	/**
	 * Sets the find expression and recompiles the matcher. A null or blank
	 * value removes the pattern.
	 * 
	 * @param findRegexpTag
	 *            the findRegexTag to set
	 * @throws java.util.regex.PatternSyntaxException
	 *             if the value is not a valid regular expression
	 */
	public void setFindRegexpTag(String findRegexpTag) {
		this.findRegexpTag = findRegexpTag;
		checkRegexpPattern();
	}

	/**
	 * @return true if a find expression is currently set
	 */
	public final boolean hasRegexpPattern() {
		return (findRegexpMatcher != null);
	}

	/**
	 * Normalizes the regexp state: a blank find expression becomes null, the
	 * matcher is rebuilt from the find expression, and a null replacement
	 * becomes the empty string.
	 */
	private void checkRegexpPattern() {
		if (findRegexpTag != null)
			if (findRegexpTag.trim().length() == 0)
				findRegexpTag = null;
		findRegexpMatcher = findRegexpTag == null ? null : Pattern.compile(findRegexpTag).matcher("");
		if (replaceRegexpTag == null)
			replaceRegexpTag = StringUtils.EMPTY;
	}

	/**
	 * Replaces every match of the find expression in the given text by the
	 * replacement string.
	 * 
	 * @param text
	 *            the text to transform
	 * @return the transformed text, or the text itself when it is null or
	 *         when no find expression is set
	 */
	public final String applyRegexPattern(String text) {
		// Without a pattern (or a text) there is nothing to replace; the
		// previous behavior was a NullPointerException.
		if (findRegexpMatcher == null || text == null)
			return text;
		return findRegexpMatcher.reset(text).replaceAll(replaceRegexpTag);
	}

	/**
	 * @return the replaceRegexTag
	 */
	public String getReplaceRegexpTag() {
		return replaceRegexpTag;
	}

	/**
	 * Sets the replacement string. A null value is stored as the empty
	 * string, as the constructors do, so that
	 * {@link #applyRegexPattern(String)} never receives a null replacement.
	 * 
	 * @param replaceRegexpTag
	 *            the replaceRegexpTag to set
	 */
	public void setReplaceRegexpTag(String replaceRegexpTag) {
		this.replaceRegexpTag = replaceRegexpTag == null ? StringUtils.EMPTY : replaceRegexpTag;
	}

	/**
	 * Sets the filePath flag. Disabling it also clears the file path prefix.
	 * 
	 * @param filePath
	 *            the FilePath to set
	 */
	public void setFilePath(boolean filePath) {
		this.filePath = filePath;
		if (!filePath)
			setFilePathPrefix(null);
	}

	/**
	 * @return the FilePath
	 */
	public boolean isFilePath() {
		return filePath;
	}

	/**
	 * @return true if the FilePath is not set
	 */
	public boolean isNotFilePath() {
		return !isFilePath();
	}

	/**
	 * @return true when neither the filePath nor the crawlFile flag is set
	 */
	public boolean isNotFilePathPrefixEditable() {
		return !(filePath || crawlFile);
	}

	/**
	 * @param crawlUrl
	 *            the crawlUrl to set
	 */
	public void setCrawlUrl(boolean crawlUrl) {
		this.crawlUrl = crawlUrl;
	}

	/**
	 * @return the crawlUrl
	 */
	public boolean isCrawlUrl() {
		return crawlUrl;
	}

	/**
	 * @return true if the crawlUrl flag is not set
	 */
	public boolean isNotCrawlUrl() {
		return !crawlUrl;
	}

	/**
	 * @param crawlFile
	 *            the crawlFile to set
	 */
	public void setCrawlFile(boolean crawlFile) {
		this.crawlFile = crawlFile;
	}

	/**
	 * @return the crawlFile
	 */
	public boolean isCrawlFile() {
		return crawlFile;
	}

	/**
	 * @return true if the crawlFile flag is not set
	 */
	public boolean isNotCrawlFile() {
		return !crawlFile;
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy