All downloads are free. The search and download features use the official Maven repository.

com.qwazr.extractor.parser.TextParser Maven / Gradle / Ivy

There is a newer version: 1.5.1
Show newest version
/*
 * Copyright 2015-2020 Emmanuel Keller / QWAZR
 * 

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at *

* http://www.apache.org/licenses/LICENSE-2.0 *

* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.qwazr.extractor.parser; import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; import com.qwazr.extractor.ParserFactory; import com.qwazr.extractor.ParserField; import com.qwazr.extractor.ParserInterface; import com.qwazr.extractor.ParserResult; import static com.qwazr.extractor.ParserUtils.languageDetection; import static com.qwazr.extractor.ParserUtils.toBufferedStream; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; import java.nio.file.Path; import java.util.Collection; import java.util.List; import javax.ws.rs.core.MediaType; import javax.ws.rs.core.MultivaluedMap; import org.apache.commons.io.IOUtils; public class TextParser implements ParserFactory, ParserInterface { private static final String NAME = "text"; private static final List DEFAULT_MIMETYPES = List.of(MediaType.TEXT_PLAIN_TYPE); private static final List DEFAULT_EXTENSIONS = List.of("txt"); final private static ParserField CHARSET_DETECTION = ParserField.newString("charset_detection", "Detection of the charset"); final private static Collection FIELDS = List.of(CONTENT, LANG_DETECTION, CHARSET_DETECTION); @Override public Collection getFields() { return FIELDS; } @Override public Collection getSupportedFileExtensions() { return DEFAULT_EXTENSIONS; } @Override public String getName() { return NAME; } @Override public ParserInterface createParser() { return this; } @Override public Collection getSupportedMimeTypes() { return DEFAULT_MIMETYPES; } @Override public ParserResult extract(final MultivaluedMap parameters, final InputStream inputStream, final MediaType mimeType) 
throws IOException { final ParserResult.Builder builder = ParserResult.of(NAME); builder.metas().set(MIME_TYPE, mimeType.toString()); // Trying to detect the CHARSET of the stream final CharsetDetector detector = new CharsetDetector(); try (final BufferedInputStream bis = new BufferedInputStream(inputStream)) { detector.setText(bis); final CharsetMatch match = detector.detect(); final ParserResult.FieldsBuilder result = builder.newDocument(); final String content; if (match != null) { content = match.getString(); result.add(CHARSET_DETECTION, match.getName()); } else { bis.reset(); content = IOUtils.toString(bis, Charset.defaultCharset()); } result.add(CONTENT, content); result.add(LANG_DETECTION, languageDetection(result, CONTENT, 10000)); } return builder.build(); } @Override public ParserResult extract(final MultivaluedMap parameters, final Path filePath) throws IOException { return toBufferedStream(filePath, input -> extract(parameters, input, MediaType.TEXT_PLAIN_TYPE)); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy