All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.batch.fs.BasicTikaFSConsumer Maven / Gradle / Ivy

There is a newer version: 3.0.0
Show newest version
package org.apache.tika.batch.fs;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.util.concurrent.ArrayBlockingQueue;

import org.apache.commons.io.IOUtils;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.OutputStreamFactory;
import org.apache.tika.batch.ParserFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.ContentHandlerFactory;
import org.xml.sax.ContentHandler;

/**
 * Basic FileResourceConsumer that reads files from an input
 * directory and writes content to the output directory.
 * 

* This catches all exceptions and errors and then logs them. * This will re-throw errors. * */ public class BasicTikaFSConsumer extends AbstractFSConsumer { private boolean parseRecursively = true; private final ParserFactory parserFactory; private final ContentHandlerFactory contentHandlerFactory; private final OutputStreamFactory fsOSFactory; private final TikaConfig config; private String outputEncoding = UTF_8.toString(); public BasicTikaFSConsumer(ArrayBlockingQueue queue, ParserFactory parserFactory, ContentHandlerFactory contentHandlerFactory, OutputStreamFactory fsOSFactory, TikaConfig config) { super(queue); this.parserFactory = parserFactory; this.contentHandlerFactory = contentHandlerFactory; this.fsOSFactory = fsOSFactory; this.config = config; } @Override public boolean processFileResource(FileResource fileResource) { Parser parser = parserFactory.getParser(config); ParseContext context = new ParseContext(); if (parseRecursively) { context.set(Parser.class, parser); } OutputStream os = getOutputStream(fsOSFactory, fileResource); //os can be null if fsOSFactory is set to skip processing a file if the output //file already exists if (os == null) { LOG.debug("Skipping: {}", fileResource.getMetadata().get(FSProperties.FS_REL_PATH)); return false; } InputStream is = getInputStream(fileResource); if (is == null) { IOUtils.closeQuietly(os); return false; } ContentHandler handler; try { handler = contentHandlerFactory.getNewContentHandler(os, getOutputEncoding()); } catch (UnsupportedEncodingException e) { incrementHandledExceptions(); LOG.error(getXMLifiedLogMsg("output_encoding_ex", fileResource.getResourceId(), e)); flushAndClose(os); throw new RuntimeException(e); } //now actually call parse! Throwable thrown = null; try { parse(fileResource.getResourceId(), parser, is, handler, fileResource.getMetadata(), context); } catch (Error t) { throw t; } catch (Throwable t) { thrown = t; } finally { flushAndClose(os); } if (thrown != null) { return false; } return true; } public String getOutputEncoding() { return outputEncoding; } public void setOutputEncoding(String outputEncoding) { this.outputEncoding = outputEncoding; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy