// uk.nhs.ciao.docs.parser.TikaDocumentParser Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of ciao-docs-parser-core Show documentation
// The newest version!
package uk.nhs.ciao.docs.parser;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import uk.nhs.ciao.docs.parser.extractor.PropertiesExtractor;
import uk.nhs.ciao.docs.parser.xml.SAXContentToDOMHandler;

import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

/**
 * A {@link DocumentParser} backed by Apache Tika.
 * <p>
 * The documents are first parsed by Tika (using the configured parser) and
 * converted to an XHTML DOM representation. Next, a map of key/value properties
 * is extracted from the DOM and returned.
 * <p>
 * Whitespace text nodes are normalised in the intermediate document.
 */
public class TikaDocumentParser implements DocumentParser {
	/** Tika parser that the input stream is handed to when a document is parsed. */
	private final Parser parser;
	/** Extracts the key/value properties from the DOM produced by parsing. */
	private final PropertiesExtractor propertiesExtractor;
	/** Content handler created once in the constructor; reused for, and cleared after, every parse. */
	private final SAXContentToDOMHandler handler;
	
	/**
	 * Creates a new document parser backed by the specified Tika parser and
	 * properties extractor.
	 *
	 * @param parser the Tika parser used to interpret incoming documents; must not be null
	 * @param propertiesExtractor the extractor applied to the parsed DOM; must not be null
	 * @throws NullPointerException if either argument is null
	 * @throws ParserConfigurationException if the intermediate content handler cannot be created
	 */
	public TikaDocumentParser(final Parser parser, final PropertiesExtractor propertiesExtractor)
			throws ParserConfigurationException {
		// Name the offending argument so that a failed precondition is self-explanatory
		this.parser = Preconditions.checkNotNull(parser, "parser");
		this.propertiesExtractor = Preconditions.checkNotNull(propertiesExtractor, "propertiesExtractor");
		this.handler = createHandler();
	}
	
	/**
	 * {@inheritDoc}
	 * <p>
	 * The input stream is first interpreted by Tika and turned into a DOM. The
	 * configured properties extractor is then run over that DOM, and finally the
	 * extracted properties are enriched with the additional metadata that Tika
	 * detected for the document.
	 */
	@Override
	public Map parseDocument(final InputStream in)
			throws UnsupportedDocumentTypeException, IOException {
		final Document dom = parseToDom(in);
		final Map extracted = propertiesExtractor.extractProperties(dom);
		// The enrichment step returns immediately for a null map, so no guard is needed here
		addTikaMetadataProperties(dom, extracted);
		return extracted;
	}

	/**
	 * Runs the configured Tika parser over the input stream and returns the DOM
	 * representation of the XHTML output, as collected by the content handler.
	 * <p>
	 * SAX and Tika failures are rethrown as {@link IOException} with the original
	 * exception attached as the cause. The handler is cleared when parsing ends,
	 * whether it succeeded or failed.
	 */
	private Document parseToDom(final InputStream in) throws IOException {
		try {
			// A fresh metadata holder and parse context are supplied for each document
			parser.parse(in, handler, new Metadata(), new ParseContext());
			final Document result = handler.getDocument();
			return result;
		} catch (TikaException tikaFailure) {
			throw new IOException(tikaFailure);
		} catch (SAXException saxFailure) {
			throw new IOException(saxFailure);
		} finally {
			// NOTE(review): the handler is a single instance field, so overlapping calls
			// would share its state - confirm that callers never parse concurrently
			handler.clear();
		}
	}
	
	/**
	 * Enriches the properties with additional metadata detected by Tika (e.g.
	 * the original document media type)
	 */
	@SuppressWarnings("unchecked")
	private void addTikaMetadataProperties(final Document document, final Map properties) {
		if (properties == null) {
			return;
		}
		
		final Map metadata = Maps.newLinkedHashMap();
		if (properties.containsKey(PropertyNames.METADATA)) {			
			metadata.putAll((Map)properties.get(PropertyNames.METADATA));
		}
		
		final NodeList nodes = document.getElementsByTagName("meta");
		for (int index = 0; index < nodes.getLength(); index++) {
			final Element element = (Element)nodes.item(index);
			final String name = element.getAttribute("name");
			final String value = element.getAttribute("content");
			
			if (!Strings.isNullOrEmpty(name) && !Strings.isNullOrEmpty(value)) {
				final Object previousValue = metadata.get(name);
				if (previousValue == null) {
					metadata.put(name, value);
				} else if (previousValue instanceof List) {
					((List