org.apache.tika.pipes.emitter.fs.FileSystemEmitter Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.pipes.emitter.fs;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.List;
import org.apache.tika.config.Field;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.emitter.AbstractEmitter;
import org.apache.tika.pipes.emitter.StreamEmitter;
import org.apache.tika.pipes.emitter.TikaEmitterException;
import org.apache.tika.serialization.JsonMetadataList;
/**
* Emitter to write to a file system.
*
* This calculates the path to write to based on the {@link #basePath}
* and the value of the {@link TikaCoreProperties#SOURCE_PATH} value.
*
*
* <properties>
* <emitters>
* <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter>
* <params>
* <!-- required -->
* <param name="name" type="string">fs</param>
* <!-- required -->
* <param name="basePath" type="string">/path/to/output</param>
* <!-- optional; default is 'json' -->
* <param name="fileExtension" type="string">json</param>
* <!-- optional; if the file already exists,
* options ('skip', 'replace', 'exception')
* default is 'exception' -->
* <param name="onExists" type="string">skip</param>
* <!-- optional; whether or not to pretty print the output
* default is false -->
* <param name="prettyPrint" type="boolean">true</param>
* </params>
* </emitter>
* </emitters>
* </properties>
*/
public class FileSystemEmitter extends AbstractEmitter implements StreamEmitter {
private Path basePath = null;
private String fileExtension = "json";
private ON_EXISTS onExists = ON_EXISTS.EXCEPTION;
private boolean prettyPrint = false;
@Override
public void emit(String emitKey, List metadataList, ParseContext parseContext) throws IOException, TikaEmitterException {
Path output;
if (metadataList == null || metadataList.size() == 0) {
throw new TikaEmitterException("metadata list must not be null or of size 0");
}
if (fileExtension != null && fileExtension.length() > 0) {
emitKey += "." + fileExtension;
}
if (basePath != null) {
output = basePath.resolve(emitKey);
} else {
output = Paths.get(emitKey);
}
if (!Files.isDirectory(output.getParent())) {
Files.createDirectories(output.getParent());
}
try (Writer writer = Files.newBufferedWriter(output, StandardCharsets.UTF_8)) {
JsonMetadataList.toJson(metadataList, writer, prettyPrint);
}
}
@Field
public void setBasePath(String basePath) {
this.basePath = Paths.get(basePath);
}
/**
* If you want to customize the output file's file extension.
* Do not include the "."
*
* @param fileExtension
*/
@Field
public void setFileExtension(String fileExtension) {
this.fileExtension = fileExtension;
}
/**
* What to do if the target file already exists. NOTE: if more than one
* thread is trying write to the same file and {@link ON_EXISTS#REPLACE} is chosen,
* you still might get a {@link FileAlreadyExistsException}.
*
* @param onExists
*/
@Field
public void setOnExists(String onExists) {
switch (onExists) {
case "skip":
this.onExists = ON_EXISTS.SKIP;
break;
case "replace":
this.onExists = ON_EXISTS.REPLACE;
break;
case "exception":
this.onExists = ON_EXISTS.EXCEPTION;
break;
default:
throw new IllegalArgumentException("Don't understand '" + onExists + "'; must be one of: 'skip', 'replace', 'exception'");
}
}
@Field
public void setPrettyPrint(boolean prettyPrint) {
this.prettyPrint = prettyPrint;
}
@Override
public void emit(String path, InputStream inputStream, Metadata userMetadata, ParseContext parseContext) throws IOException, TikaEmitterException {
Path target = basePath.resolve(path);
if (!Files.isDirectory(target.getParent())) {
Files.createDirectories(target.getParent());
}
if (onExists == ON_EXISTS.REPLACE) {
Files.copy(inputStream, target, StandardCopyOption.REPLACE_EXISTING);
} else if (onExists == ON_EXISTS.EXCEPTION) {
Files.copy(inputStream, target);
} else if (onExists == ON_EXISTS.SKIP) {
if (!Files.isRegularFile(target)) {
try {
Files.copy(inputStream, target);
} catch (FileAlreadyExistsException e) {
//swallow
}
}
}
}
enum ON_EXISTS {
SKIP, EXCEPTION, REPLACE
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy