All downloads are free. Search and download functionality uses the official Maven repository.

com.jaeksoft.searchlib.parser.HtmlParser Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you will be able to quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

The newest version!
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2008-2015 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.parser;

import java.io.IOException;
import java.io.StringWriter;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPathExpressionException;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similar.MoreLikeThis;
import org.apache.lucene.util.Version;
import org.xml.sax.SAXException;

import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.ClassPropertyEnum;
import com.jaeksoft.searchlib.analysis.LanguageEnum;
import com.jaeksoft.searchlib.crawler.web.database.UrlFilterItem;
import com.jaeksoft.searchlib.crawler.web.database.UrlItemFieldEnum;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.parser.htmlParser.HtmlDocumentProvider;
import com.jaeksoft.searchlib.parser.htmlParser.HtmlNodeAbstract;
import com.jaeksoft.searchlib.parser.htmlParser.HtmlParserEnum;
import com.jaeksoft.searchlib.schema.FieldValueItem;
import com.jaeksoft.searchlib.streamlimiter.LimitException;
import com.jaeksoft.searchlib.streamlimiter.StreamLimiter;
import com.jaeksoft.searchlib.util.IOUtils;
import com.jaeksoft.searchlib.util.Lang;
import com.jaeksoft.searchlib.util.LinkUtils;
import com.jaeksoft.searchlib.util.StringUtils;

public class HtmlParser extends Parser {

	/** Default MIME types for this parser; they are consumed outside this class. */
	public static final String[] DEFAULT_MIMETYPES = { "text/html", "application/xhtml+xml" };

	/** Default file extensions for this parser; they are consumed outside this class. */
	public static final String[] DEFAULT_EXTENSIONS = { "html", "xhtml" };

	/**
	 * Lower-case tag names at which getBodyTextContent flushes the text
	 * accumulated so far as a separate value. Shared by all instances and
	 * filled by the constructor the first time it finds the set empty.
	 * NOTE(review): raw type; only String values are ever added, so this was
	 * presumably declared as TreeSet&lt;String&gt; - confirm.
	 */
	private final static TreeSet sentenceTagSet = new TreeSet();

	/** Parser fields handed to the super-constructor by the HtmlParser constructor. */
	private static ParserFieldEnum[] fl = { ParserFieldEnum.parser_name, ParserFieldEnum.title,
			ParserFieldEnum.generated_title, ParserFieldEnum.body, ParserFieldEnum.meta_keywords,
			ParserFieldEnum.meta_description, ParserFieldEnum.meta_robots, ParserFieldEnum.internal_link,
			ParserFieldEnum.internal_link_nofollow, ParserFieldEnum.external_link,
			ParserFieldEnum.external_link_nofollow, ParserFieldEnum.lang, ParserFieldEnum.htmlProvider,
			ParserFieldEnum.htmlSource };

	/**
	 * Pairs the boost configured for one tag with the first text that was
	 * stored for that tag during the current parse.
	 * <p>
	 * Deliberately a non-static inner class: the constructor reads the boost
	 * through the enclosing parser's {@code getFloatProperty}.
	 */
	private class BoostTag {
		/** Boost read from the parser property given to the constructor. */
		private final Float boost;
		/** First value recorded for the tag; stays {@code null} until addFieldBody sets it. */
		private String firstContent = null;

		private BoostTag(ClassPropertyEnum classPropertyEnum) {
			boost = getFloatProperty(classPropertyEnum);
		}
	}

	/**
	 * Boost settings keyed by lower-case tag name ("h1" to "h6"); rebuilt at
	 * the start of every parseContent call. The type arguments are required:
	 * addFieldBody assigns the result of {@code get} to a BoostTag without a cast.
	 */
	private Map<String, BoostTag> boostTagMap;

	/** Boost applied to the title field; read from TITLE_BOOST by parseContent. */
	private Float titleBoost;
	/** When true, a "noindex" robots meta value is not honoured. */
	private boolean ignoreMetaNoIndex;
	/** When true, a "nofollow" robots meta value is not honoured. */
	private boolean ignoreMetaNoFollow;
	/** When true, rel="nofollow" on a link is not honoured. */
	private boolean ignoreLinkNoFollow;
	/** When true, parseContent stops early for documents with a null or empty title. */
	private boolean ignoreUntitledDocuments;
	/** When true, parseContent stops early when the canonical URL differs from the current URL. */
	private boolean ignoreNonCanonical;
	/** Outcome of the canonical check of the last parseContent call; true until a mismatch is seen. */
	private boolean isCanonical = true;

	public HtmlParser() {
		super(fl);
		synchronized (this) {
			if (sentenceTagSet.size() == 0) {
				sentenceTagSet.add("p");
				sentenceTagSet.add("td");
				sentenceTagSet.add("div");
				sentenceTagSet.add("h1");
				sentenceTagSet.add("h2");
				sentenceTagSet.add("h3");
				sentenceTagSet.add("h4");
				sentenceTagSet.add("h5");
				sentenceTagSet.add("h6");
				sentenceTagSet.add("hr");
				sentenceTagSet.add("li");
				sentenceTagSet.add("option");
				sentenceTagSet.add("pre");
				sentenceTagSet.add("select");
				sentenceTagSet.add("table");
				sentenceTagSet.add("tbody");
				sentenceTagSet.add("td");
				sentenceTagSet.add("textarea");
				sentenceTagSet.add("tfoot");
				sentenceTagSet.add("thead");
				sentenceTagSet.add("th");
				sentenceTagSet.add("title");
				sentenceTagSet.add("tr");
				sentenceTagSet.add("ul");
			}
		}
	}

	/**
	 * Registers the configurable properties of the HTML parser, after those of
	 * the super class, together with their default values.
	 *
	 * @throws SearchLibException
	 *             propagated from the property registration
	 */
	@Override
	public void initProperties() throws SearchLibException {
		super.initProperties();

		// Input handling and parser selection.
		addProperty(ClassPropertyEnum.SIZE_LIMIT, "0", null, 20, 1);
		addProperty(ClassPropertyEnum.DEFAULT_CHARSET, "UTF-8", null, 20, 1);
		addProperty(ClassPropertyEnum.HTML_PARSER, HtmlParserEnum.BestScoreParser.getLabel(),
				HtmlParserEnum.getLabelArray(), 0, 0);
		addProperty(ClassPropertyEnum.URL_FRAGMENT, ClassPropertyEnum.KEEP_REMOVE_LIST[0],
				ClassPropertyEnum.KEEP_REMOVE_LIST, 0, 0);

		// Boolean switches that default to "false".
		final ClassPropertyEnum[] switchesOffByDefault = { ClassPropertyEnum.IGNORE_META_NOINDEX,
				ClassPropertyEnum.IGNORE_META_NOFOLLOW, ClassPropertyEnum.IGNORE_LINK_NOFOLLOW,
				ClassPropertyEnum.IGNORE_UNTITLED_DOCUMENTS };
		for (ClassPropertyEnum flag : switchesOffByDefault)
			addProperty(flag, Boolean.FALSE.toString(), ClassPropertyEnum.BOOLEAN_LIST, 0, 0);

		// The only boolean switch that defaults to "true".
		addProperty(ClassPropertyEnum.IGNORE_NON_CANONICAL, Boolean.TRUE.toString(), ClassPropertyEnum.BOOLEAN_LIST, 0,
				0);

		// Title boost, then heading boosts from h1 down to h6.
		addProperty(ClassPropertyEnum.TITLE_BOOST, "2", null, 10, 1);
		final ClassPropertyEnum[] headingBoosts = { ClassPropertyEnum.H1_BOOST, ClassPropertyEnum.H2_BOOST,
				ClassPropertyEnum.H3_BOOST, ClassPropertyEnum.H4_BOOST, ClassPropertyEnum.H5_BOOST,
				ClassPropertyEnum.H6_BOOST };
		final String[] headingDefaults = { "1.8", "1.6", "1.4", "1.2", "1.1", "1.1" };
		for (int i = 0; i < headingBoosts.length; i++)
			addProperty(headingBoosts[i], headingDefaults[i], null, 10, 1);

		addProperty(ClassPropertyEnum.XPATH_EXCLUSION, "", null, 50, 5);
	}

	// Prefix of a CSS class name or meta name that routes content to a named
	// field; the remainder, split on '.', is handed to addDirectFields.
	private final static String OPENSEARCHSERVER_FIELD = "opensearchserver.field.";
	// CSS class name that makes getBodyTextContent skip an element and its
	// children.
	private final static String OPENSEARCHSERVER_IGNORE = "opensearchserver.ignore";
	// Cached prefix length, used to cut the field part out of a matching name.
	private final static int OPENSEARCHSERVER_FIELD_LENGTH = OPENSEARCHSERVER_FIELD.length();

	/**
	 * Recursively collects the text found below {@code node} into {@code sb}
	 * and, at block boundaries, flushes the accumulated text into
	 * {@code result}.
	 * <p>
	 * Skipped entirely: excluded nodes, comments, {@code script},
	 * {@code style}, {@code object} and {@code title} elements,
	 * {@code <oss ignore="yes">} elements, and elements carrying the
	 * {@code opensearchserver.ignore} class. An element carrying a class of the
	 * form {@code opensearchserver.field.a.b} redirects the text of its subtree
	 * to the direct fields {@code a} and {@code b}.
	 *
	 * @param result
	 *            receives the flushed text
	 * @param sb
	 *            accumulator shared by the whole recursion; emptied on each
	 *            flush
	 * @param node
	 *            current node
	 * @param bAddBlock
	 *            when false nothing is flushed, text only accumulates in
	 *            {@code sb}
	 * @param directFields
	 *            target fields inherited from an ancestor, or null to store
	 *            the text in the body field
	 * @param recursion
	 *            remaining depth budget; the walk stops with a warning when it
	 *            reaches 0
	 * @param nodeExclusionsSet
	 *            underlying nodes to skip, or null for no exclusion
	 */
	private void getBodyTextContent(ParserResultItem result, StringBuilder sb, HtmlNodeAbstract<?> node,
			boolean bAddBlock, String[] directFields, int recursion, Set<?> nodeExclusionsSet) {
		if (recursion == 0) {
			Logging.warn("Max recursion reached (getBodyTextContent)");
			return;
		}
		if (nodeExclusionsSet != null && nodeExclusionsSet.contains(node.node))
			return;

		recursion--;
		if (node.isComment())
			return;
		String nodeName = node.getNodeName();
		if ("script".equalsIgnoreCase(nodeName))
			return;
		if ("style".equalsIgnoreCase(nodeName))
			return;
		if ("object".equalsIgnoreCase(nodeName))
			return;
		if ("title".equalsIgnoreCase(nodeName))
			return;
		if ("oss".equalsIgnoreCase(nodeName)) {
			if ("yes".equalsIgnoreCase(node.getAttribute("ignore")))
				return;
		}

		boolean bEnterDirectField = false;
		String classNameAttribute = node.getAttribute("class");
		if (classNameAttribute != null) {
			String[] classNames = org.apache.commons.lang.StringUtils.split(classNameAttribute);
			if (classNames != null) {
				for (String className : classNames) {
					if (OPENSEARCHSERVER_IGNORE.equalsIgnoreCase(className))
						return;
					if (className.startsWith(OPENSEARCHSERVER_FIELD)) {
						// Cut the prefix off the matching class name only. Using
						// the whole attribute value produced a wrong field name
						// as soon as the element carried more than one class.
						String directField = className.substring(OPENSEARCHSERVER_FIELD_LENGTH);
						if (directField.length() > 0) {
							directFields = directField.split("\\.");
							bEnterDirectField = directFields.length > 0;
						}
					}
				}
			}
		}

		if (node.isTextNode()) {
			String text = node.getText();
			// Line breaks carry no meaning here; collapse all whitespace runs.
			text = text.replace('\r', ' ').replace('\n', ' ');
			text = StringUtils.replaceConsecutiveSpaces(text, " ");
			text = text.trim();
			if (text.length() > 0) {
				text = StringEscapeUtils.unescapeHtml4(text);
				if (sb.length() > 0)
					sb.append(' ');
				sb.append(text);
			}
		}

		// Wildcard type: accepts whatever parameterisation getChildNodes declares.
		List<? extends HtmlNodeAbstract<?>> children = node.getChildNodes();
		if (children != null)
			for (HtmlNodeAbstract<?> htmlNode : children)
				getBodyTextContent(result, sb, htmlNode, bAddBlock, directFields, recursion, nodeExclusionsSet);

		if (bAddBlock && nodeName != null && sb.length() > 0) {
			// Locale.ROOT keeps tag names such as "TITLE" or "LI" matching the
			// ASCII entries of the tag set whatever the JVM default locale is.
			String currentTag = nodeName.toLowerCase(Locale.ROOT);
			boolean bForSentence = sb.charAt(sb.length() - 1) != '.' && sentenceTagSet.contains(currentTag);
			if (bForSentence || bEnterDirectField) {
				if (directFields != null)
					result.addDirectFields(directFields, sb.toString());
				else
					addFieldBody(result, currentTag, sb.toString());
				sb.setLength(0);
			}
		}
	}

	/**
	 * Stores {@code value} in the title field, weighted with the title boost
	 * loaded by parseContent.
	 *
	 * @param result
	 *            the item receiving the field
	 * @param value
	 *            the title text
	 */
	protected void addFieldTitle(ParserResultItem result, String value) {
		final Float boost = this.titleBoost;
		result.addField(ParserFieldEnum.title, value, boost);
	}

	/**
	 * Stores {@code value} in the body field. The boost is the one registered
	 * for {@code tag} in the boost map, or 1.0 when the tag has no entry or no
	 * boost value. The first value seen for a registered tag is remembered on
	 * its entry.
	 *
	 * @param result
	 *            the item receiving the field
	 * @param tag
	 *            lower-case name of the tag that closed the text block
	 * @param value
	 *            the text to store
	 */
	protected void addFieldBody(ParserResultItem result, String tag, String value) {
		final BoostTag entry = boostTagMap.get(tag);
		if (entry != null && entry.firstContent == null)
			entry.firstContent = value;

		final Float weight;
		if (entry == null || entry.boost == null)
			weight = Float.valueOf(1.0F);
		else
			weight = entry.boost;
		result.addField(ParserFieldEnum.body, value, weight);
	}

	/**
	 * Returns the first non-null entry of {@code charsets}, or null when there
	 * is none.
	 * <p>
	 * The scan stops at the first later entry that differs from the selected
	 * one. That only affects the positions reported in the debug message, not
	 * the returned value.
	 *
	 * @param charsets
	 *            candidate charset names in priority order; entries may be
	 *            null
	 * @return the selected charset name, or null
	 */
	private final static String selectCharset(String... charsets) {
		if (charsets.length == 0)
			return null;
		String chosen = null;
		int chosenRank = 0;
		int scanned = 0;
		while (scanned < charsets.length) {
			final String candidate = charsets[scanned];
			scanned++;
			if (candidate == null)
				continue;
			if (chosen == null) {
				chosen = candidate;
				chosenRank = scanned;
			} else if (!chosen.equals(candidate)) {
				break;
			}
		}
		if (Logging.isDebug)
			Logging.debug("SelectedCharset : " + chosen + " (" + chosenRank + '/' + scanned + ')');
		return chosen;
	}

	/**
	 * Asks {@code htmlParserEnum} for a document provider over the stream and,
	 * when an exclusion set is supplied, evaluates every non-blank line of
	 * {@code xPathExclusions} against it with {@code xPathExclusionSet} as the
	 * target collection.
	 *
	 * @param htmlParserEnum
	 *            the parser implementation selector
	 * @param charset
	 *            charset passed on to the provider
	 * @param streamLimiter
	 *            source of the document content
	 * @param xPathExclusions
	 *            XPath expressions, one per line
	 * @param xPathExclusionSet
	 *            target of the XPath evaluation, or null to skip it
	 * @return the provider, or null when the selector returned none
	 * @throws SearchLibException
	 *             wrapping instantiation, SAX, parser configuration and XPath
	 *             failures
	 */
	private final HtmlDocumentProvider getHtmlDocumentProvider(HtmlParserEnum htmlParserEnum, String charset,
			StreamLimiter streamLimiter, String xPathExclusions, Set xPathExclusionSet)
					throws LimitException, IOException, SearchLibException {

		final boolean withExclusions = xPathExclusionSet != null;

		HtmlDocumentProvider provider;
		try {
			provider = htmlParserEnum.getHtmlParser(charset, streamLimiter, withExclusions);
		} catch (InstantiationException e) {
			throw new SearchLibException(e);
		} catch (IllegalAccessException e) {
			throw new SearchLibException(e);
		} catch (SAXException e) {
			throw new SearchLibException(e);
		} catch (ParserConfigurationException e) {
			throw new SearchLibException(e);
		}

		// Nothing more to do without a provider or without exclusions.
		if (provider == null || !withExclusions)
			return provider;

		String[] expressions = StringUtils.splitLines(xPathExclusions);
		try {
			for (String expression : expressions) {
				if (StringUtils.isBlank(expression))
					continue;
				provider.xPath(expression, xPathExclusionSet);
			}
		} catch (XPathExpressionException e) {
			throw new SearchLibException(e);
		}
		return provider;
	}

	/**
	 * Parses one HTML document read from {@code streamLimiter} and fills a new
	 * parser result item.
	 * <p>
	 * Steps, in order:
	 * <ol>
	 * <li>load the configurable properties;</li>
	 * <li>choose a first charset (source document header, then detection, then
	 * the configured default) and build the document provider;</li>
	 * <li>resolve the current URL and run the canonical check; the method
	 * returns without producing a result when the document is not canonical
	 * and non-canonical documents are ignored;</li>
	 * <li>store title, HTML source, meta fields and charset, re-parsing once
	 * if the header, meta and detected charsets select another charset;</li>
	 * <li>collect links, unless forbidden by the robots meta value;</li>
	 * <li>collect the body text, unless the robots meta value says
	 * noindex;</li>
	 * <li>determine the language;</li>
	 * <li>build the generated title when that field is mapped.</li>
	 * </ol>
	 *
	 * @param streamLimiter
	 *            source of the document content
	 * @param forcedLang
	 *            not used by this implementation
	 * @throws IOException
	 *             on read failure, or wrapping an invalid URL syntax
	 * @throws SearchLibException
	 *             on provider or XPath failure
	 */
	@Override
	protected void parseContent(StreamLimiter streamLimiter, LanguageEnum forcedLang)
			throws IOException, SearchLibException {

		titleBoost = getFloatProperty(ClassPropertyEnum.TITLE_BOOST);
		// Sorted map: the generated-title lookup at the end of this method
		// visits h1 first, then h2, and so on.
		final Map<String, BoostTag> headingBoosts = new TreeMap<String, BoostTag>();
		headingBoosts.put("h1", new BoostTag(ClassPropertyEnum.H1_BOOST));
		headingBoosts.put("h2", new BoostTag(ClassPropertyEnum.H2_BOOST));
		headingBoosts.put("h3", new BoostTag(ClassPropertyEnum.H3_BOOST));
		headingBoosts.put("h4", new BoostTag(ClassPropertyEnum.H4_BOOST));
		headingBoosts.put("h5", new BoostTag(ClassPropertyEnum.H5_BOOST));
		headingBoosts.put("h6", new BoostTag(ClassPropertyEnum.H6_BOOST));
		boostTagMap = headingBoosts;
		ignoreMetaNoIndex = getBooleanProperty(ClassPropertyEnum.IGNORE_META_NOINDEX);
		ignoreMetaNoFollow = getBooleanProperty(ClassPropertyEnum.IGNORE_META_NOFOLLOW);
		ignoreLinkNoFollow = getBooleanProperty(ClassPropertyEnum.IGNORE_LINK_NOFOLLOW);
		ignoreUntitledDocuments = getBooleanProperty(ClassPropertyEnum.IGNORE_UNTITLED_DOCUMENTS);
		ignoreNonCanonical = getBooleanProperty(ClassPropertyEnum.IGNORE_NON_CANONICAL);

		String currentCharset = null;
		String headerCharset = null;
		String detectedCharset = null;

		IndexDocument sourceDocument = getSourceDocument();

		// First choice: the charset recorded on the source document.
		if (sourceDocument != null) {
			FieldValueItem fieldValueItem = sourceDocument
					.getFieldValue(UrlItemFieldEnum.INSTANCE.contentTypeCharset.getName(), 0);
			if (fieldValueItem != null)
				headerCharset = fieldValueItem.getValue();
			if (headerCharset == null) {
				fieldValueItem = sourceDocument.getFieldValue(UrlItemFieldEnum.INSTANCE.contentEncoding.getName(), 0);
				if (fieldValueItem != null)
					headerCharset = fieldValueItem.getValue();
			}
			currentCharset = headerCharset;
		}

		// Second choice: the charset detected on the stream.
		if (currentCharset == null) {
			detectedCharset = streamLimiter.getDetectedCharset();
			currentCharset = detectedCharset;
		}

		// Last resort: the configured default.
		if (currentCharset == null) {
			currentCharset = getProperty(ClassPropertyEnum.DEFAULT_CHARSET).getValue();
		}

		String xPathExclusions = getProperty(ClassPropertyEnum.XPATH_EXCLUSION).getValue();
		Set<Object> xPathExclusionsSet = null;
		if (!StringUtils.isEmpty(xPathExclusions))
			xPathExclusionsSet = new HashSet<Object>();

		HtmlParserEnum htmlParserEnum = HtmlParserEnum.find(getProperty(ClassPropertyEnum.HTML_PARSER).getValue());

		HtmlDocumentProvider htmlProvider = getHtmlDocumentProvider(htmlParserEnum, currentCharset, streamLimiter,
				xPathExclusions, xPathExclusionsSet);
		if (htmlProvider == null)
			return;

		// Current URL: base href, then the stream origin, then the url field
		// of the source document.
		URL currentURL = htmlProvider.getBaseHref();
		IndexDocument srcDoc = getSourceDocument();
		String streamOriginalUrl = streamLimiter.getOriginURL();
		try {
			if (currentURL == null && !StringUtils.isEmpty(streamOriginalUrl))
				currentURL = LinkUtils.newEncodedURL(streamOriginalUrl);
			if (currentURL == null && srcDoc != null) {
				FieldValueItem fvi = srcDoc.getFieldValue(UrlItemFieldEnum.INSTANCE.url.getName(), 0);
				if (fvi != null)
					currentURL = LinkUtils.newEncodedURL(fvi.getValue());
			}
		} catch (URISyntaxException e) {
			throw new IOException(e);
		}

		URL canonicalURL = htmlProvider.getCanonicalLink(currentURL);
		if (canonicalURL != null) {
			String canUrl = canonicalURL.toExternalForm();
			addDetectedLink(canUrl);
			// The comparison needs a current URL; without one the document
			// cannot be shown to be non-canonical (this used to be an NPE).
			if (ignoreNonCanonical && currentURL != null) {
				String curUrl = currentURL.toExternalForm();
				if (!canUrl.equals(curUrl)) {
					isCanonical = false;
					return;
				}
			}
		}
		isCanonical = true;

		String title = htmlProvider.getTitle();
		if (ignoreUntitledDocuments)
			if (title == null || title.length() == 0)
				return;

		ParserResultItem result = getNewParserResultItem();

		addFieldTitle(result, title);

		result.addField(ParserFieldEnum.htmlProvider, htmlProvider.getName());

		// Check ContentType charset in meta http-equiv
		String metaCharset = htmlProvider.getMetaCharset();

		String selectedCharset = selectCharset(headerCharset, metaCharset, detectedCharset);

		if (selectedCharset != null && !selectedCharset.equals(currentCharset)) {
			currentCharset = selectedCharset;
			// Nodes collected from the first parse belong to a document that
			// is about to be discarded.
			if (xPathExclusionsSet != null)
				xPathExclusionsSet.clear();
			htmlProvider = getHtmlDocumentProvider(htmlParserEnum, currentCharset, streamLimiter, xPathExclusions,
					xPathExclusionsSet);
			// Same guard as after the first parse; without it the provider
			// was dereferenced below even when none could be built.
			if (htmlProvider == null)
				return;
		}

		StringWriter writer = new StringWriter();
		IOUtils.copy(streamLimiter.getNewInputStream(), writer, currentCharset);
		result.addField(ParserFieldEnum.htmlSource, writer.toString());
		writer.close();

		HtmlNodeAbstract<?> rootNode = htmlProvider.getRootNode();
		if (rootNode == null)
			return;

		// Meta tags named "opensearchserver.field.<a>.<b>" feed their content
		// directly into the named fields.
		for (HtmlNodeAbstract<?> metaNode : htmlProvider.getMetas()) {
			String metaName = metaNode.getAttributeText("name");
			if (metaName != null && metaName.startsWith(OPENSEARCHSERVER_FIELD)) {
				String field = metaName.substring(OPENSEARCHSERVER_FIELD_LENGTH);
				String[] fields = field.split("\\.");
				if (fields != null) {
					String content = metaNode.getAttributeText("content");
					result.addDirectFields(fields, content);
				}
			}
		}

		result.addField(ParserFieldEnum.charset, currentCharset);

		String metaRobots = null;

		String metaDcLanguage = null;

		String metaContentLanguage = null;

		for (HtmlNodeAbstract<?> node : htmlProvider.getMetas()) {
			String attr_name = node.getAttributeText("name");
			String attr_http_equiv = node.getAttributeText("http-equiv");
			if ("keywords".equalsIgnoreCase(attr_name))
				result.addField(ParserFieldEnum.meta_keywords, HtmlDocumentProvider.getMetaContent(node));
			else if ("description".equalsIgnoreCase(attr_name))
				result.addField(ParserFieldEnum.meta_description, HtmlDocumentProvider.getMetaContent(node));
			else if ("robots".equalsIgnoreCase(attr_name))
				metaRobots = HtmlDocumentProvider.getMetaContent(node);
			else if ("dc.language".equalsIgnoreCase(attr_name))
				metaDcLanguage = HtmlDocumentProvider.getMetaContent(node);
			else if ("content-language".equalsIgnoreCase(attr_http_equiv))
				metaContentLanguage = HtmlDocumentProvider.getMetaContent(node);
		}

		boolean metaRobotsFollow = true;
		boolean metaRobotsNoIndex = false;
		if (metaRobots != null) {
			// Locale.ROOT: with a Turkish default locale "NOINDEX" would be
			// lower-cased with a dotless i and never match.
			metaRobots = metaRobots.toLowerCase(Locale.ROOT);
			if (metaRobots.contains("noindex") && !ignoreMetaNoIndex) {
				metaRobotsNoIndex = true;
				result.addField(ParserFieldEnum.meta_robots, "noindex");
			}
			if (metaRobots.contains("nofollow") && !ignoreMetaNoFollow) {
				metaRobotsFollow = false;
				result.addField(ParserFieldEnum.meta_robots, "nofollow");
			}
		}

		UrlFilterItem[] urlFilterList = getUrlFilterList();

		boolean removeFragment = ClassPropertyEnum.KEEP_REMOVE_LIST[1]
				.equalsIgnoreCase(getProperty(ClassPropertyEnum.URL_FRAGMENT).getValue());

		// Wildcard type: accepts whatever parameterisation the node API declares.
		List<? extends HtmlNodeAbstract<?>> nodes = rootNode.getAllNodes("a", "frame", "img");
		if (srcDoc != null && nodes != null && metaRobotsFollow) {
			for (HtmlNodeAbstract<?> node : nodes) {
				String href = null;
				String rel = null;
				String nodeName = node.getNodeName();
				if ("a".equals(nodeName)) {
					href = node.getAttributeText("href");
					rel = node.getAttributeText("rel");
				} else if ("frame".equals(nodeName) || "img".equals(nodeName)) {
					href = node.getAttributeText("src");
				}
				boolean follow = true;
				if (rel != null)
					if (rel.contains("nofollow") && !ignoreLinkNoFollow)
						follow = false;
				URL newUrl = null;
				if (href != null)
					if (!href.startsWith("javascript:"))
						if (currentURL != null) {
							href = StringEscapeUtils.unescapeXml(href);
							newUrl = LinkUtils.getLink(currentURL, href, urlFilterList, removeFragment);
						}
				if (newUrl != null) {
					// newUrl is only set when currentURL is non-null.
					ParserFieldEnum field = null;
					if (newUrl.getHost().equalsIgnoreCase(currentURL.getHost())) {
						if (follow)
							field = ParserFieldEnum.internal_link;
						else
							field = ParserFieldEnum.internal_link_nofollow;
					} else {
						if (follow)
							field = ParserFieldEnum.external_link;
						else
							field = ParserFieldEnum.external_link_nofollow;
					}
					String link = newUrl.toExternalForm();
					result.addField(field, link);
					if (follow)
						addDetectedLink(link);
				}
			}
		}

		if (!metaRobotsNoIndex) {
			nodes = rootNode.getNodes("html", "body");
			if (nodes == null || nodes.size() == 0)
				nodes = rootNode.getNodes("html");
			if (nodes != null && nodes.size() > 0) {
				StringBuilder sb = new StringBuilder();
				getBodyTextContent(result, sb, nodes.get(0), true, null, 1024, xPathExclusionsSet);
				// Whatever was not flushed at a block boundary.
				result.addField(ParserFieldEnum.body, sb);
			}
		}

		// Language identification: html lang attribute, then the
		// content-language meta, then the dc.language meta.
		Locale lang = null;
		String langMethod = null;
		String[] pathHtml = { "html" };
		nodes = rootNode.getNodes(pathHtml);
		if (nodes != null && nodes.size() > 0) {
			// langMethod is only stored when a language is actually found.
			langMethod = "html lang attribute";
			String l = nodes.get(0).getAttributeText("lang");
			if (l != null)
				lang = Lang.findLocaleISO639(l);
		}
		if (lang == null && metaContentLanguage != null) {
			langMethod = "meta http-equiv content-language";
			lang = Lang.findLocaleISO639(metaContentLanguage);
		}
		if (lang == null && metaDcLanguage != null) {
			langMethod = "meta dc.language";
			lang = Lang.findLocaleISO639(metaDcLanguage);
		}

		if (lang != null) {
			result.addField(ParserFieldEnum.lang, lang.getLanguage());
			result.addField(ParserFieldEnum.lang_method, langMethod);
		} else if (!metaRobotsNoIndex)
			lang = result.langDetection(10000, ParserFieldEnum.body);

		if (getFieldMap().isMapped(ParserFieldEnum.generated_title)) {

			// The generated title is "<host> - <text>".
			StringBuilder sb = new StringBuilder();
			try {
				if (!StringUtils.isEmpty(streamOriginalUrl))
					sb.append(new URI(streamOriginalUrl).getHost());
			} catch (URISyntaxException e) {
				Logging.error(e);
			}

			// Preferred text: the first heading content recorded while the
			// body was collected.
			String generatedTitle = null;
			for (BoostTag boostTag : headingBoosts.values()) {
				if (boostTag.firstContent != null) {
					generatedTitle = boostTag.firstContent;
					break;
				}
			}

			// Fallback: the first interesting term MoreLikeThis reports for
			// the merged body text.
			if (generatedTitle == null) {
				final String FIELD_TITLE = "contents";

				MemoryIndex bodyMemoryIndex = new MemoryIndex();
				Analyzer bodyAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
				String bodyText = result.getMergedBodyText(100000, " ", ParserFieldEnum.body);
				bodyMemoryIndex.addField(FIELD_TITLE, bodyText, bodyAnalyzer);

				IndexSearcher indexSearcher = bodyMemoryIndex.createSearcher();
				IndexReader indexReader = indexSearcher.getIndexReader();
				MoreLikeThis mlt = new MoreLikeThis(indexReader);
				mlt.setAnalyzer(bodyAnalyzer);
				mlt.setFieldNames(new String[] { FIELD_TITLE });
				mlt.setMinWordLen(3);
				mlt.setMinTermFreq(1);
				mlt.setMinDocFreq(1);

				String[] words = mlt.retrieveInterestingTerms(0);
				if (words != null && words.length > 0)
					generatedTitle = words[0];
			}

			if (generatedTitle != null) {
				if (sb.length() > 0)
					sb.append(" - ");
				sb.append(generatedTitle);
			}

			// Truncate at the first space found from index 60 on, or at 67
			// characters when there is none, and mark the cut.
			if (sb.length() > 67) {
				int pos = sb.indexOf(" ", 60);
				if (pos == -1)
					pos = 67;
				sb.setLength(pos);
				sb.append("...");
			}
			result.addField(ParserFieldEnum.generated_title, sb.toString());
		}

	}

	/**
	 * Reports the outcome of the canonical check made by the last
	 * parseContent call.
	 *
	 * @return false when that call stopped because the canonical URL differed
	 *         from the current URL; true otherwise, including before any parse
	 */
	public boolean isCanonical() {
		return this.isCanonical;
	}

}