com.opensearchserver.textextractor.ParserAbstract Maven / Gradle / Ivy
The newest version!
/**
* Copyright 2014 OpenSearchServer Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.opensearchserver.textextractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import javax.ws.rs.core.MultivaluedMap;
import org.apache.commons.io.IOUtils;
import com.opensearchserver.textextractor.util.Language;
public abstract class ParserAbstract {
/**
 * Metadata holder handed to the result, together with the documents, at the
 * end of doParsing.
 */
protected final ParserDocument metas;

/** Documents created through getNewParserDocument(), in creation order. */
private final List<ParserDocument> documents;

/**
 * Parameters of the current parsing request. Null until a doParsing call
 * supplies them (and still null afterwards if the caller passed null).
 */
protected MultivaluedMap<String, String> parameters;
/**
 * Creates a parser with an empty document list, a fresh ParserDocument for
 * the metadata and no request parameters.
 */
protected ParserAbstract() {
    // Explicit type argument (rather than a raw ArrayList) keeps the list
    // type-checked without relying on the Java 7 diamond operator.
    documents = new ArrayList<ParserDocument>(0);
    metas = new ParserDocument();
    parameters = null;
}
/**
 * Creates a new ParserDocument, appends it to the list of documents held by
 * this parser and hands it back so the caller can fill it.
 *
 * @return the newly created and registered document
 */
protected ParserDocument getNewParserDocument() {
    final ParserDocument created = new ParserDocument();
    documents.add(created);
    return created;
}
/**
 * Returns one value of a request parameter.
 *
 * @param param
 *            the parameter to look up; its name is used as the key into
 *            the parameter map
 * @param position
 *            zero-based index of the wanted value among the values
 *            supplied for that parameter
 * @return the value at that position, or null when no parameters were
 *         supplied, the parameter is absent, or the position is out of
 *         range (including negative positions)
 */
protected String getParameterValue(ParserField param, int position) {
    if (parameters == null || position < 0)
        return null;
    // Looked up as a plain Object and narrowed with a wildcard so that the
    // method type-checks without any unchecked conversion.
    final Object entry = parameters.get(param.name);
    if (!(entry instanceof List))
        return null;
    final List<?> values = (List<?>) entry;
    if (position >= values.size())
        return null;
    final Object value = values.get(position);
    return value == null ? null : value.toString();
}
/**
 * Returns the parameters of the parser.
 *
 * @return the {@link ParserField} entries describing the parameters this
 *         parser declares
 */
protected abstract ParserField[] getParameters();
/**
 * Returns the fields returned by this parser.
 *
 * @return the {@link ParserField} entries describing the fields this
 *         parser produces
 */
protected abstract ParserField[] getFields();
/**
 * Read a document and fill the ParserDocument list.
 *
 * @param inputStream
 *            the stream the document content is read from
 * @throws Exception
 *             if the content cannot be read or parsed; the concrete
 *             failure types depend on the implementation
 */
protected abstract void parseContent(InputStream inputStream)
throws Exception;
/**
 * Read a document from a file and fill the ParserDocument list.
 * <p>
 * This default implementation opens the file as a stream, delegates to
 * {@link #parseContent(InputStream)} and closes the stream afterwards,
 * ignoring any error raised while closing.
 *
 * @param file
 *            the file to read the document from
 * @throws Exception
 *             if the file cannot be opened, or if the stream-based parsing
 *             fails
 */
protected void parseContent(File file) throws Exception {
    // Opened outside the try block: if opening fails there is nothing to
    // close, and the exception simply propagates.
    final InputStream fileStream = new FileInputStream(file);
    try {
        parseContent(fileStream);
    } finally {
        IOUtils.closeQuietly(fileStream);
    }
}
/**
 * Copies the content of a stream into a newly created temporary file.
 * <p>
 * The input stream is read to its end but is not closed by this method.
 * If the copy fails, the partially written temporary file is removed so
 * that failed calls do not leave files behind.
 *
 * @param inputStream
 *            the content to store
 * @param extension
 *            the suffix passed to {@link File#createTempFile(String, String)}
 * @return the temporary file holding the copied content; the caller is
 *         responsible for deleting it
 * @throws IOException
 *             if the file cannot be created, written or closed
 */
protected static final File createTempFile(InputStream inputStream,
        String extension) throws IOException {
    final File tempFile = File.createTempFile("oss-text-extractor", extension);
    boolean copied = false;
    FileOutputStream fos = null;
    try {
        fos = new FileOutputStream(tempFile);
        IOUtils.copy(inputStream, fos);
        // Close explicitly on the success path so that a failing close is
        // reported to the caller instead of being swallowed.
        fos.close();
        fos = null;
        copied = true;
        return tempFile;
    } finally {
        // Only reached with a non-null stream on the failure path.
        IOUtils.closeQuietly(fos);
        // Best-effort cleanup: fall back to deletion at JVM exit when the
        // file cannot be removed right away.
        if (!copied && !tempFile.delete())
            tempFile.deleteOnExit();
    }
}
/**
 * Parses the content of a stream and returns the collected result.
 * <p>
 * The parameters are stored on this instance before parsing starts, so
 * that getParameterValue can read them while parseContent runs. This
 * method itself does not close the stream.
 *
 * @param parameters
 *            the request parameters, keyed by parameter name; may be null
 * @param inputStream
 *            the stream to read the document from
 * @return the result, completed with the metadata and the documents held
 *         by this parser
 * @throws Exception
 *             if parseContent fails
 */
public final ParserResult doParsing(
        MultivaluedMap<String, String> parameters, InputStream inputStream)
        throws Exception {
    this.parameters = parameters;
    // Created before parsing and completed after it; the order is kept
    // because ParserResult may observe when it was created.
    final ParserResult result = new ParserResult();
    parseContent(inputStream);
    result.done(metas, documents);
    return result;
}
/**
 * Parses the content of a file and returns the collected result.
 * <p>
 * The parameters are stored on this instance before parsing starts, so
 * that getParameterValue can read them while parseContent runs.
 *
 * @param parameters
 *            the request parameters, keyed by parameter name; may be null
 * @param file
 *            the file to read the document from
 * @return the result, completed with the metadata and the documents held
 *         by this parser
 * @throws Exception
 *             if parseContent fails
 */
public final ParserResult doParsing(
        MultivaluedMap<String, String> parameters, File file)
        throws Exception {
    this.parameters = parameters;
    // Created before parsing and completed after it; the order is kept
    // because ParserResult may observe when it was created.
    final ParserResult result = new ParserResult();
    parseContent(file);
    result.done(metas, documents);
    return result;
}
/**
* Submit the content of a field to language detection. It checks all the
* documents.
*
* @param source
* The field to submit
* @param maxLength
* The maximum number of characters
* @return
*/
protected final String languageDetection(ParserField source, int maxLength) {
StringBuilder sb = new StringBuilder();
for (ParserDocument document : documents) {
List