All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.jaeksoft.searchlib.snippet.SnippetField Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class, search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTFul API you will be able to integrate quickly and easily advanced full-text search capabilities in your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

There is a newer version: 1.5.14
Show newest version
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2008-2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see .
 **/

package com.jaeksoft.searchlib.snippet;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.lang.StringEscapeUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.Query;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;

import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.function.expression.SyntaxError;
import com.jaeksoft.searchlib.index.ReaderInterface;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.request.AbstractSearchRequest;
import com.jaeksoft.searchlib.schema.AbstractField;
import com.jaeksoft.searchlib.schema.FieldValueItem;
import com.jaeksoft.searchlib.schema.FieldValueOriginEnum;
import com.jaeksoft.searchlib.schema.SchemaField;
import com.jaeksoft.searchlib.schema.SchemaFieldList;
import com.jaeksoft.searchlib.snippet.SnippetVectors.SnippetVector;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.Timer;
import com.jaeksoft.searchlib.util.XPathParser;
import com.jaeksoft.searchlib.util.XmlWriter;

public class SnippetField extends AbstractField {

	/**
	 * 
	 */
	private static final long serialVersionUID = 1989504404725110730L;

	private FragmenterAbstract fragmenterTemplate;
	private String tag;
	private String[] tags;
	private String separator;
	private String unescapedSeparator;
	private int maxSnippetSize;
	private int maxSnippetNumber;
	private int timeLimit;
	private transient SnippetQueries snippetQueries;
	private transient Query query;
	private transient Analyzer analyzer;

	private SnippetField(String fieldName, String tag, String separator,
			int maxSnippetSize, int maxSnippetNumber,
			FragmenterAbstract fragmenterTemplate, int timeLimit) {
		super(fieldName);
		this.snippetQueries = null;
		setTag(tag);
		setSeparator(separator);
		this.maxSnippetSize = maxSnippetSize;
		this.maxSnippetNumber = maxSnippetNumber;
		this.fragmenterTemplate = fragmenterTemplate;
		this.timeLimit = timeLimit;
	}

	public SnippetField(String fieldName) {
		this(fieldName, "em", "...", 200, 1, FragmenterAbstract.NOFRAGMENTER, 0);
	}

	@Override
	public SnippetField duplicate() {
		return new SnippetField(name, tag, separator, maxSnippetSize,
				maxSnippetNumber, fragmenterTemplate, timeLimit);
	}

	public String getFragmenter() {
		return fragmenterTemplate.getClass().getSimpleName();
	}

	public void setFragmenter(String fragmenterName) throws SearchLibException {
		try {
			fragmenterTemplate = FragmenterAbstract.newInstance(fragmenterName);
		} catch (InstantiationException e) {
			throw new SearchLibException(e);
		} catch (IllegalAccessException e) {
			throw new SearchLibException(e);
		}
	}

	/**
	 * @return the tag
	 */
	public String getTag() {
		return tag;
	}

	/**
	 * @param tag
	 *            the tag to set
	 */
	public void setTag(String tag) {
		this.tag = tag;
		if (tag != null && tag.length() > 0) {
			tags = new String[2];
			tags[0] = '<' + tag + '>';
			tags[1] = "';
		} else
			tags = null;
	}

	/**
	 * @return the separator
	 */
	public String getSeparator() {
		return separator;
	}

	/**
	 * @param separator
	 *            the separator to set
	 */
	public void setSeparator(String separator) {
		this.separator = separator;
		unescapedSeparator = separator == null ? null : StringEscapeUtils
				.unescapeHtml(separator);
	}

	/**
	 * @return the maxSnippetSize
	 */
	public int getMaxSnippetSize() {
		return maxSnippetSize;
	}

	/**
	 * @param maxSnippetSize
	 *            the maxSnippetSize to set
	 */
	public void setMaxSnippetSize(int maxSnippetSize) {
		this.maxSnippetSize = maxSnippetSize;
	}

	/**
	 * @return the maxSnippetNumber
	 */
	public int getMaxSnippetNumber() {
		return maxSnippetNumber;
	}

	/**
	 * @param maxSnippetNumber
	 *            the maxSnippetNumber to set
	 */
	public void setMaxSnippetNumber(int maxSnippetNumber) {
		this.maxSnippetNumber = maxSnippetNumber;
	}

	/**
	 * Retourne la liste des champs "snippet".
	 * 
	 * @param xPath
	 * @param node
	 * @param target
	 * @throws IllegalAccessException
	 * @throws InstantiationException
	 */
	public static void copySnippetFields(Node node, SchemaFieldList source,
			SnippetFieldList target) throws InstantiationException,
			IllegalAccessException {
		String fieldName = XPathParser.getAttributeString(node, "name");
		String tag = XPathParser.getAttributeString(node, "tag");
		if (tag == null)
			tag = "em";
		int maxSnippetNumber = XPathParser.getAttributeValue(node,
				"maxSnippetNumber");
		if (maxSnippetNumber == 0)
			maxSnippetNumber = 1;
		int maxSnippetSize = XPathParser.getAttributeValue(node,
				"maxSnippetSize");
		if (maxSnippetSize == 0)
			maxSnippetSize = 200;
		int timeLimit = DomUtils.getAttributeInteger(node, "timeLimit", 0);

		FragmenterAbstract fragmenter = FragmenterAbstract
				.newInstance(XPathParser.getAttributeString(node,
						"fragmenterClass"));
		fragmenter.setAttributes(node.getAttributes());
		String separator = XPathParser.getAttributeString(node, "separator");
		if (separator == null)
			separator = "...";
		SchemaField schemaField = source.get(fieldName);
		if (schemaField == null)
			return;
		SnippetField field = new SnippetField(schemaField.getName(), tag,
				separator, maxSnippetSize, maxSnippetNumber, fragmenter,
				timeLimit);
		target.put(field);
	}

	public final void reset() {
		snippetQueries = null;
		query = null;
		analyzer = null;
	}

	public void initSearchTerms(AbstractSearchRequest searchRequest)
			throws ParseException, SyntaxError, IOException, SearchLibException {
		synchronized (this) {
			if (snippetQueries != null)
				return;
			this.query = searchRequest.getSnippetQuery();
			this.analyzer = searchRequest.getAnalyzer();
			snippetQueries = new SnippetQueries(this.query, name);
		}
	}

	private final void appendSubString(String text, int start, int end,
			StringBuilder sb) {
		if (text == null)
			return;
		int l = text.length();
		if (end > l)
			end = l;
		if (end < start)
			return;
		sb.append(text.substring(start, end));
	}

	private final SnippetVector checkValue(SnippetVector currentVector,
			Iterator vectorIterator, int startOffset,
			Fragment fragment) {
		if (currentVector == null)
			return null;
		StringBuilder result = new StringBuilder();
		String originalText = fragment.getOriginalText();
		int originalTextLength = originalText.length();
		int endOffset = startOffset + originalTextLength;
		int pos = 0;
		while (currentVector != null) {
			int end = currentVector.end - fragment.vectorOffset;
			if (end > endOffset)
				break;
			int start = currentVector.start - fragment.vectorOffset;
			if (start >= startOffset) {
				appendSubString(originalText, pos, start - startOffset, result);
				if (tags != null)
					result.append(tags[0]);
				appendSubString(originalText, start - startOffset, end
						- startOffset, result);
				if (tags != null)
					result.append(tags[1]);
				pos = end - startOffset;
			}
			currentVector = vectorIterator.hasNext() ? vectorIterator.next()
					: null;
		}
		if (result.length() == 0)
			return currentVector;
		if (pos < originalTextLength)
			appendSubString(originalText, pos, originalTextLength, result);
		fragment.setHighlightedText(result.toString());
		return currentVector;
	}

	public boolean getSnippets(final int docId, final ReaderInterface reader,
			final List values,
			final List snippets, final Timer parentTimer)
			throws IOException, ParseException, SyntaxError, SearchLibException {

		if (values == null)
			return false;

		final Timer timer = new Timer(parentTimer, "SnippetField " + this.name);
		final long halfTimeExpiration = this.timeLimit == 0 ? 0 : timer
				.getStartOffset(this.timeLimit / 2);
		final long expiration = this.timeLimit == 0 ? 0 : timer
				.getStartOffset(this.timeLimit);

		FragmenterAbstract fragmenter = fragmenterTemplate.newInstance();
		SnippetVector currentVector = null;

		Timer t = new Timer(timer, "extractTermVectorIterator");

		Iterator vectorIterator = SnippetVectors
				.extractTermVectorIterator(docId, reader, snippetQueries, name,
						t, halfTimeExpiration);
		if (vectorIterator != null)
			currentVector = vectorIterator.hasNext() ? vectorIterator.next()
					: null;

		t.end(null);

		t = new Timer(timer, "getFraments");

		int startOffset = 0;
		FragmentList fragments = new FragmentList();
		int vectorOffset = 0;
		for (FieldValueItem valueItem : values) {
			String value = valueItem.getValue();
			if (value != null) {
				// VectorOffset++ depends of EndOffset bug #patch Lucene 579 and
				// 1458
				fragmenter.getFragments(value, fragments, vectorOffset++);
			}
		}

		t.end(null);

		if (fragments.size() == 0) {
			timer.end(null);
			return false;
		}

		t = new Timer(timer, "checkValue");

		Fragment fragment = fragments.first();
		while (fragment != null) {
			currentVector = checkValue(currentVector, vectorIterator,
					startOffset, fragment);
			startOffset += fragment.getOriginalText().length();
			fragment = fragment.next();
		}

		t.end(null);

		Timer sbTimer = new Timer(timer, "snippetBuilder");

		boolean result = false;
		int snippetCounter = maxSnippetNumber;
		int scoredFragment = 0;
		while (snippetCounter-- != 0) {
			Fragment bestScoreFragment = null;
			fragment = Fragment.findNextHighlightedFragment(fragments.first());
			List scoreFragments = new ArrayList(0);
			double maxSearchScore = 0;

			t = new Timer(sbTimer, "fragmentScore");
			boolean expired = false;

			while (fragment != null) {
				double sc = fragment.searchScore(name, analyzer, query);
				if (sc > maxSearchScore)
					maxSearchScore = sc;
				scoreFragments.add(fragment);
				fragment = Fragment
						.findNextHighlightedFragment(fragment.next());
				scoredFragment++;
				if (expiration != 0) {
					if (System.currentTimeMillis() > expiration) {
						expired = true;
						break;
					}
				}
			}

			t.end("fragmentScore " + scoredFragment + " " + expired);

			for (Fragment frag : scoreFragments)
				bestScoreFragment = Fragment.bestScore(bestScoreFragment, frag,
						maxSearchScore, maxSnippetSize);

			if (bestScoreFragment != null) {
				SnippetBuilder snippetBuilder = new SnippetBuilder(
						maxSnippetSize, unescapedSeparator, tags,
						bestScoreFragment);
				if (snippetBuilder.length() > 0)
					snippets.add(new FieldValueItem(
							FieldValueOriginEnum.SNIPPET, snippetBuilder
									.toString()));
				fragments.remove(snippetBuilder.getFragments());
				result = true;
				continue;
			}

			if (fragments.first() == null)
				break;
			SnippetBuilder snippetBuilder = new SnippetBuilder(maxSnippetSize,
					unescapedSeparator, tags, fragments.first());
			if (snippetBuilder.length() > 0) {
				snippets.add(new FieldValueItem(FieldValueOriginEnum.SNIPPET,
						snippetBuilder.toString()));
				fragments.remove(snippetBuilder.getFragments());
			}
		}

		sbTimer.end(null);

		timer.end(null);

		return result;
	}

	@Override
	public void writeXmlConfig(XmlWriter xmlWriter) throws SAXException {
		xmlWriter.startElement("field", "name", name, "tag", tag, "separator",
				separator, "maxSnippetSize", Integer.toString(maxSnippetSize),
				"maxSnippetNumber", Integer.toString(maxSnippetNumber),
				"fragmenterClass",
				fragmenterTemplate != null ? fragmenterTemplate.getClass()
						.getSimpleName() : null, "timeLimit", Long
						.toString(timeLimit));
		xmlWriter.endElement();
	}

	@Override
	public int compareTo(SnippetField f) {
		int c = super.compareTo(f);
		if (c != 0)
			return c;
		if ((c = fragmenterTemplate.getClass().getName()
				.compareTo(f.fragmenterTemplate.getClass().getName())) != 0)
			return c;
		if ((c = tag.compareTo(f.tag)) != 0)
			return c;
		if ((c = separator.compareTo(f.separator)) != 0)
			return c;
		if ((c = maxSnippetSize - f.maxSnippetSize) != 0)
			return c;
		if ((c = maxSnippetNumber - f.maxSnippetNumber) != 0)
			return c;
		return 0;
	}

	/**
	 * @return the timeLimit
	 */
	public int getTimeLimit() {
		return timeLimit;
	}

	/**
	 * @param timeLimit
	 *            the timeLimit to set
	 */
	public void setTimeLimit(int timeLimit) {
		this.timeLimit = timeLimit;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy