/*
* Copyright (c) 2020 MarkLogic Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.marklogic.contentpump;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import com.marklogic.mapreduce.ContentType;
import com.marklogic.mapreduce.DocumentURI;
import com.marklogic.mapreduce.MarkLogicConstants;
import com.marklogic.mapreduce.MarkLogicDocument;
import com.marklogic.mapreduce.utilities.URIUtil;
/**
 * RecordWriter that writes exported documents, and optionally their
 * metadata, to zip archives, keeping a separate archive per content type.
 *
 * @author jchen
 */
public class ArchiveWriter extends RecordWriter<DocumentURI, MarkLogicDocument>
    implements MarkLogicConstants, ConfigConstants {
public static final Log LOG = LogFactory.getLog(ArchiveWriter.class);
private String dir;
private TaskAttemptContext context;
/**
* Archive for Text
*/
private OutputArchive txtArchive;
/**
* Archive for XML
*/
private OutputArchive xmlArchive;
/**
* Archive for JSON
*/
private OutputArchive jsonArchive;
/**
* Archive for Binary
*/
private OutputArchive binaryArchive;
    /**
     * Whether the writer exports document content only, without metadata
     */
private boolean isExportDoc;
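    /**
     * Output encoding for text, XML and JSON content
     */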
private String encoding;
public ArchiveWriter(Path path, TaskAttemptContext context) {
dir = path.toString();
this.context = context;
Configuration conf = context.getConfiguration();
encoding = conf.get(OUTPUT_CONTENT_ENCODING, DEFAULT_ENCODING);
String type = conf.get(CONF_OUTPUT_TYPE, DEFAULT_OUTPUT_TYPE);
ExportOutputType outputType = ExportOutputType.valueOf(
type.toUpperCase());
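        // Export type DOCUMENT writes document content only; ARCHIVE also
        // captures each document's metadata for later import.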
isExportDoc = ExportOutputType.DOCUMENT.equals(outputType);
}
@Override
    public void close(TaskAttemptContext context) throws IOException,
        InterruptedException {
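        // Close whichever per-content-type archives were opened during writing.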
if (txtArchive != null) {
txtArchive.close();
}
if (xmlArchive != null) {
xmlArchive.close();
}
if (jsonArchive != null) {
jsonArchive.close();
}
if (binaryArchive != null) {
binaryArchive.close();
}
}
@Override
public void write(DocumentURI uri, MarkLogicDocument content)
throws IOException, InterruptedException {
ContentType type = content.getContentType();
        if (type == null) {
            throw new IOException("null content type for document " + uri);
        }
Configuration conf = context.getConfiguration();
String dst = null;
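        // Archive names carry the task ID (distributed mode) or a timestamp
        // (local mode) plus the content type, so parallel writers and
        // per-type archives do not collide.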
        String mode = conf.get(MarkLogicConstants.EXECUTION_MODE);
        SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmssZ");
        String timestamp = sdf.format(new Date());
        if (MODE_DISTRIBUTED.equals(mode)) {
            dst = dir + "/" + context.getTaskAttemptID().getTaskID().getId()
                + "-" + timestamp + "-" + type.toString();
        } else if (MODE_LOCAL.equals(mode)) {
            dst = dir + "/" + timestamp + "-" + type.toString();
        }
// Decode URI if exporting documents in compressed form.
String zipEntryName = isExportDoc ? URIUtil.getPathFromURI(uri) :
uri.getUri();
if (zipEntryName == null) {
if (isExportDoc) {
LOG.error("Error parsing URI, skipping: " + uri);
} else {
LOG.error("Found document with empty URI.");
}
return;
}
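        // Lazily create one output archive per content type on first use.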
if (ContentType.BINARY.equals(type)) {
if (binaryArchive == null) {
binaryArchive = new OutputArchive(dst, conf);
}
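            // For archive export, write the document metadata as a companion
            // entry ahead of the content itself.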
if (!isExportDoc) {
binaryArchive.write(zipEntryName + DocumentMetadata.EXTENSION,
((DatabaseDocumentWithMeta) content).getMeta().toXML()
.getBytes(encoding), false);
}
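            // Stream large binaries into the archive instead of buffering the
            // whole document in memory.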
            if (content.isStreamable()) {
                long size = content.getContentSize();
                try (InputStream is = content.getContentAsByteStream()) {
                    binaryArchive.write(zipEntryName, is, size, isExportDoc);
                }
            } else {
binaryArchive.write(zipEntryName,
content.getContentAsByteArray(), isExportDoc);
}
} else if (ContentType.TEXT.equals(type)) {
            if (txtArchive == null) {
txtArchive = new OutputArchive(dst, conf);
}
if (!isExportDoc) {
txtArchive.write(zipEntryName + DocumentMetadata.EXTENSION,
((DatabaseDocumentWithMeta) content).getMeta().toXML()
.getBytes(encoding), false);
}
String text = content.getContentAsString();
txtArchive.write(zipEntryName, text.getBytes(encoding),
isExportDoc);
} else if (ContentType.XML.equals(type)) {
            if (xmlArchive == null) {
xmlArchive = new OutputArchive(dst, conf);
}
if (!isExportDoc) {
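                // A naked properties fragment carries properties but no
                // document content, so only its metadata entry is written.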
if (((DatabaseDocumentWithMeta) content).getMeta().isNakedProps) {
xmlArchive.write(zipEntryName + DocumentMetadata.NAKED,
((DatabaseDocumentWithMeta) content).getMeta()
.toXML().getBytes(encoding), false);
} else {
xmlArchive.write(
zipEntryName + DocumentMetadata.EXTENSION,
((DatabaseDocumentWithMeta) content).getMeta()
.toXML().getBytes(encoding), isExportDoc);
xmlArchive.write(zipEntryName,
content.getContentAsString().getBytes(encoding),
isExportDoc);
}
} else {
String doc = content.getContentAsString();
if (doc == null) {
LOG.error("Empty document for " + zipEntryName);
return;
}
xmlArchive.write(zipEntryName, doc.getBytes(encoding),
isExportDoc);
}
} else if (ContentType.JSON.equals(type)) {
if (jsonArchive == null) {
jsonArchive = new OutputArchive(dst, conf);
}
if (!isExportDoc) {
jsonArchive.write(zipEntryName + DocumentMetadata.EXTENSION,
((DatabaseDocumentWithMeta) content).getMeta()
.toXML().getBytes(encoding), isExportDoc);
jsonArchive.write(zipEntryName,
content.getContentAsString().getBytes(encoding),
isExportDoc);
} else {
String doc = content.getContentAsString();
if (doc == null) {
LOG.error("Empty document for " + zipEntryName);
return;
}
jsonArchive.write(zipEntryName, doc.getBytes(encoding),
isExportDoc);
}
} else {
LOG.error("Skipping " + uri + ". Unsupported content type: "
+ type.name());
}
}
}