All downloads are FREE. The search and download functionality uses the official Maven repository.

com.opensearchserver.textextractor.ParserAbstract Maven / Gradle / Ivy

The newest version!
/**
 * Copyright 2014 OpenSearchServer Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.opensearchserver.textextractor;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import javax.ws.rs.core.MultivaluedMap;

import org.apache.commons.io.IOUtils;

import com.opensearchserver.textextractor.util.Language;

/**
 * Base class for the text extractor parsers.
 * <p>
 * A concrete parser implements {@link #parseContent(InputStream)} and, while
 * reading the input, fills {@link #metas} and the documents obtained from
 * {@link #getNewParserDocument()}. Callers trigger the parsing through one of
 * the {@code doParsing} methods, which hand both to a {@link ParserResult}.
 * <p>
 * The request parameters, the metas and the document list are stored on the
 * instance and nothing in this class resets them between two
 * {@code doParsing} calls.
 */
public abstract class ParserAbstract {

	/** Document holding the metas; passed to the result next to the documents. */
	protected final ParserDocument metas;

	/** Documents created through {@link #getNewParserDocument()}, in creation order. */
	private final List<ParserDocument> documents;

	/** Parameters of the current parsing request; null until doParsing is called. */
	protected MultivaluedMap<String, String> parameters;

	protected ParserAbstract() {
		documents = new ArrayList<ParserDocument>(0);
		metas = new ParserDocument();
		parameters = null;
	}

	/**
	 * Create a new empty document and register it in the document list of this
	 * parser.
	 * 
	 * @return the newly created document
	 */
	protected ParserDocument getNewParserDocument() {
		ParserDocument document = new ParserDocument();
		documents.add(document);
		return document;
	}

	/**
	 * Look up one value of a request parameter.
	 * 
	 * @param param
	 *            The parameter to read (its name is used as the key)
	 * @param position
	 *            The zero-based index of the value to return
	 * @return the value, or null if no parameters were provided, if the
	 *         parameter is absent, or if there is no value at this position
	 */
	protected String getParameterValue(ParserField param, int position) {
		if (parameters == null)
			return null;
		List<String> values = parameters.get(param.name);
		if (values == null)
			return null;
		// A negative position is treated like any other missing value instead
		// of raising an IndexOutOfBoundsException.
		if (position < 0 || position >= values.size())
			return null;
		return values.get(position);
	}

	/**
	 * The parameters of the parser
	 * 
	 * @return the parameters supported by this parser
	 */
	protected abstract ParserField[] getParameters();

	/**
	 * The fields returned by this parser
	 * 
	 * @return the fields this parser may fill
	 */
	protected abstract ParserField[] getFields();

	/**
	 * Read a document and fill the ParserDocument list.
	 * 
	 * @param inputStream
	 *            The content to parse
	 * @throws Exception
	 *             if the content cannot be read or parsed
	 */
	protected abstract void parseContent(InputStream inputStream)
			throws Exception;

	/**
	 * Read a document and fill the ParserDocument list. This default
	 * implementation opens the file and delegates to
	 * {@link #parseContent(InputStream)}; the stream is always closed.
	 * 
	 * @param file
	 *            The file to parse
	 * @throws Exception
	 *             if the file cannot be opened or the parsing fails
	 */
	protected void parseContent(File file) throws Exception {
		InputStream is = null;
		try {
			is = new FileInputStream(file);
			parseContent(is);
		} finally {
			// closeQuietly accepts null, no guard needed
			IOUtils.closeQuietly(is);
		}
	}

	/**
	 * Copy the content of a stream into a new temporary file. The input stream
	 * is not closed by this method. The caller is responsible for deleting the
	 * returned file.
	 * 
	 * @param inputStream
	 *            The content to copy
	 * @param extension
	 *            The suffix given to the temporary file
	 * @return the temporary file holding the content
	 * @throws IOException
	 *             if the file cannot be created or written; in that case the
	 *             temporary file is removed
	 */
	protected final static File createTempFile(InputStream inputStream,
			String extension) throws IOException {
		File tempFile = File.createTempFile("oss-text-extractor", extension);
		FileOutputStream fos = null;
		boolean success = false;
		try {
			fos = new FileOutputStream(tempFile);
			IOUtils.copy(inputStream, fos);
			// Explicit close so that a failure while flushing is reported
			fos.close();
			fos = null;
			success = true;
			return tempFile;
		} finally {
			IOUtils.closeQuietly(fos);
			// The caller never receives the file on failure: remove it
			// (best effort) rather than leaving it in the temp directory.
			if (!success)
				tempFile.delete();
		}
	}

	/**
	 * Parse the content of a stream.
	 * 
	 * @param parameters
	 *            The parameters of the request, may be null
	 * @param inputStream
	 *            The content to parse
	 * @return the result built from the metas and the documents of this parser
	 * @throws Exception
	 *             if the parsing fails
	 */
	public final ParserResult doParsing(
			MultivaluedMap<String, String> parameters, InputStream inputStream)
			throws Exception {
		this.parameters = parameters;
		ParserResult result = new ParserResult();
		parseContent(inputStream);
		result.done(metas, documents);
		return result;
	}

	/**
	 * Parse the content of a file.
	 * 
	 * @param parameters
	 *            The parameters of the request, may be null
	 * @param file
	 *            The file to parse
	 * @return the result built from the metas and the documents of this parser
	 * @throws Exception
	 *             if the parsing fails
	 */
	public final ParserResult doParsing(
			MultivaluedMap<String, String> parameters, File file)
			throws Exception {
		this.parameters = parameters;
		ParserResult result = new ParserResult();
		parseContent(file);
		result.done(metas, documents);
		return result;
	}

	/**
	 * Append the non-null values to the buffer, each followed by a space, and
	 * stop as soon as the buffer holds more than maxLength characters.
	 * 
	 * @return true if the limit has been exceeded
	 */
	private static boolean appendFieldText(StringBuilder sb,
			List<?> objectList, int maxLength) {
		for (Object object : objectList) {
			if (object == null)
				continue;
			sb.append(object.toString());
			sb.append(' ');
			if (sb.length() > maxLength)
				return true;
		}
		return false;
	}

	/**
	 * Submit the content of a field to language detection. It checks all the
	 * documents, in order, and stops collecting text once more than maxLength
	 * characters have been gathered. Documents which do not have the field are
	 * skipped.
	 * 
	 * @param source
	 *            The field to submit
	 * @param maxLength
	 *            The maximum number of characters
	 * @return the value returned by Language.quietDetect for the collected text
	 */
	protected final String languageDetection(ParserField source, int maxLength) {
		StringBuilder sb = new StringBuilder();
		for (ParserDocument document : documents) {
			List<?> objectList = document.fields.get(source.name);
			if (objectList == null)
				continue;
			// Enough text collected: no need to read the remaining documents
			if (appendFieldText(sb, objectList, maxLength))
				break;
		}
		return Language.quietDetect(sb.toString(), maxLength);
	}

	/**
	 * Submit the content of a field of one document to language detection.
	 * Text collection stops once more than maxLength characters have been
	 * gathered.
	 * 
	 * @param document
	 *            The document holding the field
	 * @param source
	 *            The field to submit
	 * @param maxLength
	 *            The maximum number of characters
	 * @return null if the document does not have the field, otherwise the
	 *         value returned by Language.quietDetect for the collected text
	 */
	protected final String languageDetection(ParserDocument document,
			ParserField source, int maxLength) {
		List<?> objectList = document.fields.get(source.name);
		if (objectList == null)
			return null;
		StringBuilder sb = new StringBuilder();
		appendFieldText(sb, objectList, maxLength);
		return Language.quietDetect(sb.toString(), maxLength);
	}

}