// com.qwazr.extractor.parser.RtfParser (Maven / Gradle / Ivy)
/*
* Copyright 2015-2020 Emmanuel Keller / QWAZR
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.qwazr.extractor.parser;
import com.qwazr.extractor.ParserFactory;
import com.qwazr.extractor.ParserField;
import com.qwazr.extractor.ParserInterface;
import com.qwazr.extractor.ParserResult;
import com.qwazr.extractor.ParserUtils;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import javax.swing.text.BadLocationException;
import javax.swing.text.Document;
import javax.swing.text.rtf.RTFEditorKit;
import javax.ws.rs.core.MediaType;
import javax.ws.rs.core.MultivaluedMap;
/**
 * Text extractor for RTF documents, built on Swing's {@link RTFEditorKit}.
 * <p>
 * The class acts as its own factory: {@link #createParser()} returns this same instance,
 * and the class declares no instance fields.
 */
public class RtfParser implements ParserFactory, ParserInterface {

    /** Parser name, returned by {@link #getName()} and used to create each result builder. */
    private static final String NAME = "rtf";

    /** Supported file extensions, each mapped to the media type recorded for it. */
    private static final Map<String, MediaType> EXT_TYPES = Map.of(
            "rtf", MediaType.valueOf("application/rtf"),
            "rtx", MediaType.valueOf("text/richtext"));

    /** Fields this parser fills in for every extracted document. */
    private static final List<ParserField> FIELDS = List.of(TITLE, CONTENT, LANG_DETECTION);

    /**
     * @return always {@code null}; this parser declares no parameters. The {@code null}
     *         (rather than an empty collection) is kept because callers may rely on it.
     */
    @Override
    public Collection<ParserField> getParameters() {
        return null;
    }

    /**
     * @return the fields written by this parser: title, content and language detection
     */
    @Override
    public Collection<ParserField> getFields() {
        return FIELDS;
    }

    /**
     * @return the file extensions handled by this parser ({@code rtf} and {@code rtx})
     */
    @Override
    public Collection<String> getSupportedFileExtensions() {
        return EXT_TYPES.keySet();
    }

    /**
     * @return the parser name, {@code "rtf"}
     */
    @Override
    public String getName() {
        return NAME;
    }

    /**
     * @return this instance; no new object is created per call
     */
    @Override
    public ParserInterface createParser() {
        return this;
    }

    /**
     * @return the media types handled by this parser ({@code application/rtf} and
     *         {@code text/richtext})
     */
    @Override
    public Collection<MediaType> getSupportedMimeTypes() {
        return EXT_TYPES.values();
    }

    /**
     * Reads an RTF stream and records its title, text content and detected language in a
     * new document of the given builder.
     *
     * @param inputStream the RTF data; this method does not close it
     * @param builder     the result builder that receives the new document
     * @return the result built from {@code builder}
     * @throws IOException if reading the stream fails, or wrapping the
     *                     {@link BadLocationException} raised while reading the document
     */
    private ParserResult extract(final InputStream inputStream, ParserResult.Builder builder) throws IOException {
        try {
            // Load the RTF data into a fresh Swing document, inserting at offset 0.
            final RTFEditorKit rtf = new RTFEditorKit();
            final Document doc = rtf.createDefaultDocument();
            rtf.read(inputStream, doc, 0);

            // Obtain a new parser document.
            final ParserResult.FieldsBuilder result = builder.newDocument();

            // NOTE(review): the title property may be absent (null) when the RTF defines
            // no title; confirm FieldsBuilder.add tolerates a null value.
            result.add(TITLE, doc.getProperty(Document.TitleProperty));

            // The content field holds the whole text of the document.
            result.add(CONTENT, doc.getText(0, doc.getLength()));

            // NOTE(review): 10000 is presumably the maximum number of characters sampled
            // for language detection; confirm against ParserUtils.languageDetection.
            result.add(LANG_DETECTION, ParserUtils.languageDetection(result, CONTENT, 10000));

            return builder.build();
        } catch (BadLocationException e) {
            // Surface Swing's positional error through the checked type callers expect.
            throw new IOException(e);
        }
    }

    /**
     * Extracts an RTF file. The recorded mime type is derived from the file extension and
     * is {@code null} when the extension is missing or not one of the supported ones.
     *
     * @param parameters not used by this parser
     * @param path       the file to parse
     * @return the extraction result
     * @throws IOException if the file cannot be read or parsed
     */
    @Override
    public ParserResult extract(final MultivaluedMap<String, String> parameters,
                                final Path path) throws IOException {
        final ParserResult.Builder builder = ParserResult.of(NAME);
        // Maps created by Map.of reject null keys on lookup, so guard a missing extension
        // and treat it exactly like an unknown one.
        final String extension = ParserUtils.getExtension(path);
        final MediaType mediaType = extension == null ? null : EXT_TYPES.get(extension);
        // NOTE(review): this overload stores a MediaType while the stream overload stores
        // its String form; confirm consumers of MIME_TYPE accept both.
        builder.metas().set(MIME_TYPE, mediaType);
        return ParserUtils.toBufferedStream(path, input -> extract(input, builder));
    }

    /**
     * Extracts RTF content from a stream and records the given mime type in its string form.
     *
     * @param parameters  not used by this parser
     * @param inputStream the RTF data; this method does not close it
     * @param mimeType    the mime type of the content; must not be {@code null} because
     *                    {@code toString()} is called on it unconditionally
     * @return the extraction result
     * @throws IOException if the stream cannot be read or parsed
     */
    @Override
    public ParserResult extract(final MultivaluedMap<String, String> parameters,
                                final InputStream inputStream,
                                final MediaType mimeType) throws IOException {
        final ParserResult.Builder builder = ParserResult.of(NAME);
        builder.metas().set(MIME_TYPE, mimeType.toString());
        return extract(inputStream, builder);
    }
}