
com.marklogic.contentpump.CompressedRDFReader Maven / Gradle / Ivy

/*
 * Copyright (c) 2023 MarkLogic Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.marklogic.contentpump;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.mapreduce.InputSplit;
import com.marklogic.mapreduce.CompressionCodec;
import com.marklogic.mapreduce.LinkedMapWritable;

/**
 * Reader for Compressed RDF statements.
 * 
 * @author nwalsh
 *
 * @param <VALUEIN>
 */
public class CompressedRDFReader<VALUEIN> extends RDFReader<VALUEIN> {
    public static final Log LOG = LogFactory.getLog(CompressedRDFReader.class);

    // When we're looking at compressed data, for the purposes of deciding if 
    // we should stream or not, we assume it'll be (compressedSize * 
    // COMPRESSIONFACTOR) when it's uncompressed.
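    // For example, a 10 MB gzip file is assumed to expand to roughly 20 MB.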
    public static final long COMPRESSIONFACTOR = 2;

    private byte[] buf = new byte[65536];
    private InputStream zipIn;
    private ZipEntry currZipEntry;
    private CompressionCodec codec;
    
    @Override
    public void close() throws IOException {
        super.close();
        // Close the underlying compressed stream (ZIP or GZIP).
        if (zipIn != null) {
            zipIn.close();
        }
    }

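    /**
     * Opens the file backing this split and dispatches on the configured
     * codec: for ZIP, each non-empty entry is buffered fully in memory and
     * parsed in turn; for GZIP, the stream is parsed directly, with the
     * uncompressed size estimated as the split length times COMPRESSIONFACTOR.
     */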
    @Override
    protected void initStream(InputSplit inSplit)
            throws IOException, InterruptedException {
        FSDataInputStream fileIn = openFile(inSplit, false);
        if (fileIn == null) {
            return;
        }
        URI zipURI = file.toUri();
        String codecString = 
                conf.get(ConfigConstants.CONF_INPUT_COMPRESSION_CODEC, 
                        CompressionCodec.ZIP.toString());
        if (codecString.equalsIgnoreCase(CompressionCodec.ZIP.toString())) {
            zipIn = new ZipInputStream(fileIn);
            codec = CompressionCodec.ZIP;
            while (true) {
                try {
                    currZipEntry = ((ZipInputStream)zipIn).getNextEntry();
                    if (currZipEntry == null) {
                        break;
                    }
                    if (currZipEntry.getSize() != 0) {
                        subId = currZipEntry.getName();
                        break;
                    }
                } catch (IllegalArgumentException e) {
                    LOG.warn("Skipped a zip entry in: " + file.toUri()
                            + ", reason: " + e.getMessage());
                }
            }
            if (currZipEntry == null) { // no entry in zip
                LOG.warn("No valid entry in zip: " + file.toUri());
                return;
            }
            ByteArrayOutputStream baos;
            long size = currZipEntry.getSize();
            if (size == -1) {
                baos = new ByteArrayOutputStream();
                // if we don't know the size, assume it's big!
                initParser(zipURI.toASCIIString() + "/" + subId,
                        INMEMORYTHRESHOLD); 
            } else {
                baos = new ByteArrayOutputStream((int) size);
                initParser(zipURI.toASCIIString() + "/" + subId, size);
            }
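            // Copy the whole entry into memory before handing it to the parser.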
            int nb;
            while ((nb = zipIn.read(buf, 0, buf.length)) != -1) {
                baos.write(buf, 0, nb);
            }
            parse(subId, new ByteArrayInputStream(baos.toByteArray()));
        } else if (codecString.equalsIgnoreCase(CompressionCodec.GZIP.toString())) {
            long size = inSplit.getLength();
            zipIn = new GZIPInputStream(fileIn);
            codec = CompressionCodec.GZIP;
            initParser(zipURI.toASCIIString(), size * COMPRESSIONFACTOR);
            parse(file.getName(), zipIn);
        } else {
            // Use codecString here: the codec field is never assigned on this path.
            throw new UnsupportedOperationException("Unsupported codec: " + codecString);
        }
    }

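    /**
     * Feeds the input either to the Jena streaming parser (when there is no
     * dataset) or into an in-memory model via loadModel().
     */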
    protected void parse(String fsname, final InputStream in)
        throws IOException {
        if (dataset == null) {
            jenaStreamingParser = new RunnableParser(origFn, fsname, in, lang);
            // Previously we created a parsing task per file and submitted it
            // to an executor pool of size 1, so files were parsed
            // sequentially. That has been replaced by direct calls to the
            // Jena parser, because AsyncParser manages its parsing in a
            // separate thread.
            jenaStreamingParser.run();
            // We don't know how many statements are in the model; counting
            // them could be expensive, so we just report 0 until we're done.
            pos = 0;
            end = 1;
        } else {
            loadModel(fsname, in);
        }
    }
    
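    /**
     * Delegates to the superclass first. Once the current document is
     * exhausted, advances to the next non-empty ZIP entry, and after the
     * last entry, to the next split. A GZIP file holds a single stream, so
     * there is nothing further to advance to.
     */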
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        boolean stillReading = super.nextKeyValue();
        if (stillReading) {
            return true;
        }

        // Ok, we've run out of data in the current file, are there more?
        URI zipURI = file.toUri();
        if (codec == CompressionCodec.ZIP) {
            ZipInputStream zis = (ZipInputStream) zipIn;

            ByteArrayOutputStream baos;
            while ((currZipEntry = zis.getNextEntry()) != null) {
                if (currZipEntry.getSize() == 0) {
                    continue;
                }

                long size = currZipEntry.getSize();
                if (size == -1) {
                    baos = new ByteArrayOutputStream();
                    // if we don't know the size, assume it's big!
                    initParser(zipURI.toASCIIString() + "/" + currZipEntry.getName(), INMEMORYTHRESHOLD);
                } else {
                    baos = new ByteArrayOutputStream((int) size);
                    initParser(zipURI.toASCIIString() + "/" + currZipEntry.getName(), size);
                }
                int nb;
                while ((nb = zis.read(buf, 0, buf.length)) != -1) {
                    baos.write(buf, 0, nb);
                }

                parse(currZipEntry.getName(), new ByteArrayInputStream(baos.toByteArray()));
                boolean gotTriples = super.nextKeyValue();
                if (gotTriples) {
                    return true;
                }
            }
            // end of zip
            if (iterator != null && iterator.hasNext()) {
                close();
                initStream(iterator.next());
                return super.nextKeyValue();
            }

            return false;
        } else {
            return false;
        }
    }

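    /**
     * Same behavior as RDFReader, except the input is flagged as compressed.
     */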
    public CompressedRDFReader(String version, LinkedMapWritable roleMap) {
        super(version, roleMap);
        compressed = true;
    }
    
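    /**
     * Coarse progress: 0 while there is still input, 1 once the reader is
     * exhausted. The uncompressed size is not known up front, so a finer
     * estimate is not attempted.
     */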
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return hasNext ? 0 : 1;
    }
    
}
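For orientation, here is a minimal sketch of the standard Hadoop RecordReader
lifecycle that drives this class. It is an illustration only: in practice
mlcp's input format and the Hadoop framework make these calls, and the
split, context, version, and roleMap variables and the Text value type are
placeholders, as is the DocumentURI key type (from com.marklogic.mapreduce,
assumed from the reader hierarchy rather than stated in this file).

// Hypothetical driver; normally the Hadoop framework performs these calls.
CompressedRDFReader<Text> reader = new CompressedRDFReader<>(version, roleMap);
reader.initialize(split, context);  // setup; ends up calling initStream(split)
while (reader.nextKeyValue()) {     // advances across zip entries and splits
    DocumentURI key = reader.getCurrentKey();
    Text value = reader.getCurrentValue();
    // ... write the resulting triples document to MarkLogic ...
}
reader.close();                     // also closes the compressed stream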