All Downloads are FREE. Search and download functionalities are using the official Maven repository.
Please wait. This can take a few minutes...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download/modify it as often as you want.
org.apache.tika.batch.fs.builders.BasicTikaFSConsumersBuilder Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.batch.fs.builders;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.apache.tika.batch.ConsumersManager;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.FileResourceConsumer;
import org.apache.tika.batch.OutputStreamFactory;
import org.apache.tika.batch.ParserFactory;
import org.apache.tika.batch.builders.AbstractConsumersBuilder;
import org.apache.tika.batch.builders.BatchProcessBuilder;
import org.apache.tika.batch.builders.IContentHandlerFactoryBuilder;
import org.apache.tika.batch.builders.IParserFactoryBuilder;
import org.apache.tika.batch.fs.BasicTikaFSConsumer;
import org.apache.tika.batch.fs.FSConsumersManager;
import org.apache.tika.batch.fs.FSOutputStreamFactory;
import org.apache.tika.batch.fs.FSUtil;
import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer;
import org.apache.tika.batch.fs.StreamOutRPWFSConsumer;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.util.ClassLoaderUtil;
import org.apache.tika.util.PropsUtil;
import org.apache.tika.util.XMLDOMUtil;
/**
 * Builds an {@link FSConsumersManager} populated with file-system consumers.
 * <p>
 * Settings are read from the runtime attributes first and, when absent there,
 * from the attributes of the XML configuration node. The node must have
 * <code>parser</code>, <code>contenthandler</code> and <code>outputstream</code>
 * child elements.
 */
public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder {

    /**
     * Creates the consumers and wraps them in a {@link ConsumersManager}.
     *
     * @param node              configuration node for the consumers
     * @param runtimeAttributes attributes supplied at runtime; these take
     *                          precedence over attributes on <code>node</code>
     * @param queue             queue handed to every consumer that is created
     * @return manager wrapping the consumers
     * @throws RuntimeException if the tika config cannot be loaded or if any of the
     *                          parser, contenthandler or outputstream child
     *                          elements is missing
     */
    @Override
    public ConsumersManager build(Node node, Map<String, String> runtimeAttributes,
                                  ArrayBlockingQueue<FileResource> queue) {

        //figure out if we're building a recursiveParserWrapper
        boolean recursiveParserWrapper = false;
        String recursiveParserWrapperString =
                resolveAttribute(runtimeAttributes, "recursiveParserWrapper", node,
                        "recursiveParserWrapper");
        if (recursiveParserWrapperString != null) {
            recursiveParserWrapper = PropsUtil.getBoolean(recursiveParserWrapperString, false);
        }

        //note: the runtime key is "streamOut" but the xml attribute is all
        //lower case ("streamout"); both spellings are kept as they were
        boolean streamOut = false;
        String streamOutString =
                resolveAttribute(runtimeAttributes, "streamOut", node, "streamout");
        if (streamOutString != null) {
            streamOut = PropsUtil.getBoolean(streamOutString, false);
        }

        //how long to let the consumersManager run on init() and shutdown()
        Long consumersManagerMaxMillis = null;
        String consumersManagerMaxMillisString =
                resolveAttribute(runtimeAttributes, "consumersManagerMaxMillis", node,
                        "consumersManagerMaxMillis");
        if (consumersManagerMaxMillisString != null) {
            consumersManagerMaxMillis = PropsUtil.getLong(consumersManagerMaxMillisString, null);
        }

        TikaConfig config = loadTikaConfig(node, runtimeAttributes);

        List<FileResourceConsumer> consumers = new LinkedList<>();
        int numConsumers = BatchProcessBuilder.getNumConsumers(runtimeAttributes);

        //pick out the three factory-defining child elements; if an element
        //name is repeated, the last occurrence wins
        NodeList nodeList = node.getChildNodes();
        Node contentHandlerFactoryNode = null;
        Node parserFactoryNode = null;
        Node outputStreamFactoryNode = null;
        for (int i = 0; i < nodeList.getLength(); i++) {
            Node child = nodeList.item(i);
            String cn = child.getNodeName();
            switch (cn) {
                case "parser":
                    parserFactoryNode = child;
                    break;
                case "contenthandler":
                    contentHandlerFactoryNode = child;
                    break;
                case "outputstream":
                    outputStreamFactoryNode = child;
                    break;
                default:
                    //ignore every other child node
                    break;
            }
        }

        if (contentHandlerFactoryNode == null || parserFactoryNode == null ||
                outputStreamFactoryNode == null) {
            throw new RuntimeException("You must specify a ContentHandlerFactory, " +
                    "a ParserFactory and an OutputStreamFactory");
        }

        ContentHandlerFactory contentHandlerFactory =
                getContentHandlerFactory(contentHandlerFactoryNode, runtimeAttributes);
        ParserFactory parserFactory = getParserFactory(parserFactoryNode, runtimeAttributes);
        OutputStreamFactory outputStreamFactory =
                getOutputStreamFactory(outputStreamFactoryNode, runtimeAttributes,
                        contentHandlerFactory, recursiveParserWrapper);

        Parser parser = parserFactory.getParser(config);
        if (recursiveParserWrapper) {
            MetadataFilter metadataFilter = config.getMetadataFilter();
            //every consumer shares the same wrapped parser instance
            parser = new RecursiveParserWrapper(parser);
            for (int i = 0; i < numConsumers; i++) {
                FileResourceConsumer c;
                if (streamOut) {
                    c = new StreamOutRPWFSConsumer(queue, parser, contentHandlerFactory,
                            outputStreamFactory, metadataFilter);
                } else {
                    c = new RecursiveParserWrapperFSConsumer(queue, parser,
                            contentHandlerFactory, outputStreamFactory, metadataFilter);
                }
                consumers.add(c);
            }
        } else {
            for (int i = 0; i < numConsumers; i++) {
                FileResourceConsumer c =
                        new BasicTikaFSConsumer(queue, parser, contentHandlerFactory,
                                outputStreamFactory);
                consumers.add(c);
            }
        }

        ConsumersManager manager = new FSConsumersManager(consumers);
        if (consumersManagerMaxMillis != null) {
            manager.setConsumersManagerMaxMillis(consumersManagerMaxMillis);
        }
        return manager;
    }

    /**
     * Looks up a setting, preferring the runtime attributes over the xml attribute.
     *
     * @return the runtime value if present; otherwise the value of the xml
     * attribute; <code>null</code> if neither is set
     */
    private static String resolveAttribute(Map<String, String> runtimeAttributes,
                                           String runtimeKey, Node node, String xmlKey) {
        String value = runtimeAttributes.get(runtimeKey);
        if (value != null) {
            return value;
        }
        Node attribute = node.getAttributes().getNamedItem(xmlKey);
        return attribute == null ? null : attribute.getNodeValue();
    }

    /**
     * Loads the {@link TikaConfig} from the path given by the runtime attribute
     * <code>c</code> or, failing that, the xml attribute <code>tikaConfig</code>.
     * Falls back to the default config when no path is specified.
     *
     * @throws RuntimeException wrapping any failure to read or parse the config file
     */
    private TikaConfig loadTikaConfig(Node node, Map<String, String> runtimeAttributes) {
        String tikaConfigPath = runtimeAttributes.get("c");
        if (tikaConfigPath == null) {
            Node tikaConfigNode = node.getAttributes().getNamedItem("tikaConfig");
            if (tikaConfigNode != null) {
                tikaConfigPath = PropsUtil.getString(tikaConfigNode.getNodeValue(), null);
            }
        }
        if (tikaConfigPath == null) {
            return TikaConfig.getDefaultConfig();
        }
        try (InputStream is = Files.newInputStream(Paths.get(tikaConfigPath))) {
            return new TikaConfig(is);
        } catch (Exception e) {
            throw new RuntimeException("Couldn't load tika config from: " + tikaConfigPath, e);
        }
    }

    /**
     * Instantiates the builder named by the <code>builderClass</code> attribute and
     * uses it to build the {@link ContentHandlerFactory}.
     *
     * @throws RuntimeException if <code>builderClass</code> is not specified
     */
    private ContentHandlerFactory getContentHandlerFactory(Node node,
                                                           Map<String, String> runtimeAttributes) {
        Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
        String className = localAttrs.get("builderClass");
        if (className == null) {
            throw new RuntimeException("Must specify builderClass for contentHandler");
        }
        IContentHandlerFactoryBuilder builder =
                ClassLoaderUtil.buildClass(IContentHandlerFactoryBuilder.class, className);
        return builder.build(node, runtimeAttributes);
    }

    /**
     * Instantiates the builder named by the <code>builderClass</code> attribute and
     * uses it to build the {@link ParserFactory}.
     *
     * @throws RuntimeException if <code>builderClass</code> is not specified
     */
    private ParserFactory getParserFactory(Node node, Map<String, String> runtimeAttributes) {
        Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
        String className = localAttrs.get("builderClass");
        if (className == null) {
            //fail with a clear message, as getContentHandlerFactory does
            throw new RuntimeException("Must specify builderClass for parser");
        }
        IParserFactoryBuilder builder =
                ClassLoaderUtil.buildClass(IParserFactoryBuilder.class, className);
        return builder.build(node, runtimeAttributes);
    }

    /**
     * Builds the {@link FSOutputStreamFactory} from the <code>outputDir</code>,
     * <code>compression</code> and <code>outputSuffix</code> attributes.
     * <p>
     * When no <code>outputSuffix</code> is given, one is derived: "json" for the
     * recursive parser wrapper, otherwise the suffix for the handler type of a
     * {@link BasicContentHandlerFactory}, followed by the compression suffix.
     *
     * @param node                      the outputstream configuration node
     * @param runtimeAttributes         attributes supplied at runtime
     * @param contentHandlerFactory     used to derive a default suffix
     * @param useRecursiveParserWrapper whether output comes from the recursive parser wrapper
     * @return factory that always uses {@link FSUtil.HANDLE_EXISTING#SKIP}
     */
    private OutputStreamFactory getOutputStreamFactory(Node node,
                                                       Map<String, String> runtimeAttributes,
                                                       ContentHandlerFactory contentHandlerFactory,
                                                       boolean useRecursiveParserWrapper) {
        Map<String, String> attrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);

        Path outputDir = PropsUtil.getPath(attrs.get("outputDir"), null);

        //order matters: "bzip2" and "gzip" both contain "zip", so the plain
        //"zip" test has to come last
        String compressionString = attrs.get("compression");
        FSOutputStreamFactory.COMPRESSION compression = FSOutputStreamFactory.COMPRESSION.NONE;
        if (compressionString != null) {
            if (compressionString.contains("bz")) {
                compression = FSOutputStreamFactory.COMPRESSION.BZIP2;
            } else if (compressionString.contains("gz")) {
                compression = FSOutputStreamFactory.COMPRESSION.GZIP;
            } else if (compressionString.contains("zip")) {
                compression = FSOutputStreamFactory.COMPRESSION.ZIP;
            }
        }

        String suffix = attrs.get("outputSuffix");
        //suffix should not start with "."
        if (suffix == null) {
            StringBuilder sb = new StringBuilder();
            if (useRecursiveParserWrapper) {
                sb.append("json");
            } else if (contentHandlerFactory instanceof BasicContentHandlerFactory) {
                appendSuffix(((BasicContentHandlerFactory) contentHandlerFactory).getType(), sb);
            }
            appendCompression(compression, sb);
            suffix = sb.toString();
        }

        //TODO: possibly open up the different handle-existings in the future
        //but for now, lock it down to require skip.  Too dangerous otherwise
        //if the driver restarts and this is set to overwrite...
        return new FSOutputStreamFactory(outputDir, FSUtil.HANDLE_EXISTING.SKIP, compression,
                suffix);
    }

    /**
     * Appends the file extension (including the leading ".") for the given
     * compression type; appends nothing for {@code NONE}.
     */
    private void appendCompression(FSOutputStreamFactory.COMPRESSION compression,
                                   StringBuilder sb) {
        switch (compression) {
            case NONE:
                break;
            case ZIP:
                sb.append(".zip");
                break;
            case BZIP2:
                sb.append(".bz2");
                break;
            case GZIP:
                sb.append(".gz");
                break;
        }
    }

    /**
     * Appends the file suffix (without a leading ".") for the handler type:
     * "xml", "html", or "txt" for every other type.
     */
    private void appendSuffix(BasicContentHandlerFactory.HANDLER_TYPE type, StringBuilder sb) {
        switch (type) {
            case XML:
                sb.append("xml");
                break;
            case HTML:
                sb.append("html");
                break;
            default:
                sb.append("txt");
        }
    }
}