All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer Maven / Gradle / Ivy

There is a newer version: 3.0.0
Show newest version
package org.apache.tika.batch.fs;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;

import org.apache.commons.io.IOUtils;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.OutputStreamFactory;
import org.apache.tika.batch.ParserFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.utils.ExceptionUtils;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Basic FileResourceConsumer that reads files from an input
 * directory and writes content to the output directory.
 * 

* This tries to catch most of the common exceptions, log them and * store them in the metadata list output. */ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer { private final ParserFactory parserFactory; private final ContentHandlerFactory contentHandlerFactory; private final OutputStreamFactory fsOSFactory; private final TikaConfig tikaConfig; private String outputEncoding = "UTF-8"; public RecursiveParserWrapperFSConsumer(ArrayBlockingQueue queue, ParserFactory parserFactory, ContentHandlerFactory contentHandlerFactory, OutputStreamFactory fsOSFactory, TikaConfig tikaConfig) { super(queue); this.parserFactory = parserFactory; this.contentHandlerFactory = contentHandlerFactory; this.fsOSFactory = fsOSFactory; this.tikaConfig = tikaConfig; } @Override public boolean processFileResource(FileResource fileResource) { Parser wrapped = parserFactory.getParser(tikaConfig); RecursiveParserWrapper parser = new RecursiveParserWrapper(wrapped, contentHandlerFactory); ParseContext context = new ParseContext(); // if (parseRecursively == true) { context.set(Parser.class, parser); // } //try to open outputstream first OutputStream os = getOutputStream(fsOSFactory, fileResource); if (os == null) { LOG.debug("Skipping: {}", fileResource.getMetadata().get(FSProperties.FS_REL_PATH)); return false; } //try to open the inputstream before the parse. //if the parse hangs or throws a nasty exception, at least there will //be a zero byte file there so that the batchrunner can skip that problematic //file during the next run. 
InputStream is = getInputStream(fileResource); if (is == null) { IOUtils.closeQuietly(os); return false; } Throwable thrown = null; List metadataList = null; Metadata containerMetadata = fileResource.getMetadata(); try { parse(fileResource.getResourceId(), parser, is, new DefaultHandler(), containerMetadata, context); metadataList = parser.getMetadata(); } catch (Throwable t) { thrown = t; metadataList = parser.getMetadata(); if (metadataList == null) { metadataList = new LinkedList<>(); } Metadata m = null; if (metadataList.size() == 0) { m = containerMetadata; } else { //take the top metadata item m = metadataList.remove(0); } String stackTrace = ExceptionUtils.getFilteredStackTrace(t); m.add(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX+"runtime", stackTrace); metadataList.add(0, m); } finally { IOUtils.closeQuietly(is); } Writer writer = null; try { writer = new OutputStreamWriter(os, getOutputEncoding()); JsonMetadataList.toJson(metadataList, writer); } catch (Exception e) { //this is a stop the world kind of thing LOG.error("{}", getXMLifiedLogMsg(IO_OS + "json", fileResource.getResourceId(), e)); throw new RuntimeException(e); } finally { flushAndClose(writer); } if (thrown != null) { if (thrown instanceof Error) { throw (Error) thrown; } else { return false; } } return true; } public String getOutputEncoding() { return outputEncoding; } public void setOutputEncoding(String outputEncoding) { this.outputEncoding = outputEncoding; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy