/*
* Copyright (c) 2019 MarkLogic Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.marklogic.contentpump;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DefaultStringifier;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import com.marklogic.contentpump.utilities.CSVParserFormatter;
import com.marklogic.contentpump.utilities.DelimitedSplit;
import com.marklogic.contentpump.utilities.DocBuilder;
import com.marklogic.contentpump.utilities.EncodingUtil;
import com.marklogic.mapreduce.DocumentURIWithSourceInfo;
import com.marklogic.mapreduce.MarkLogicConstants;
import com.marklogic.mapreduce.utilities.TextArrayWritable;
/**
 * InputFormat for delimited text. The first line of each input file is
 * treated as metadata (the header); every subsequent line is a record.
*
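 * For example (an illustrative sample, not taken from the original sources),
 * a comma-delimited input such as:
 * <pre>
 * id,first,last
 * 1,John,Smith
 * 2,Jane,Doe
 * </pre>
 * produces one document per data line, with the header fields supplying the
 * element names of each record.
 *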
* @author ali
*
*/
public class DelimitedTextInputFormat extends
        FileAndDirectoryInputFormat<DocumentURIWithSourceInfo, Text> {
public static final Log LOG = LogFactory.getLog(
DelimitedTextInputFormat.class);
@Override
    public RecordReader<DocumentURIWithSourceInfo, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
if (isSplitInput(context.getConfiguration())) {
return new SplitDelimitedTextReader();
} else {
return new DelimitedTextReader();
}
}
    private boolean isSplitInput(Configuration conf) {
return conf.getBoolean(
ConfigConstants.CONF_SPLIT_INPUT, false);
}
    public List<InputSplit> getSplits(JobContext job) throws IOException {
boolean delimSplit = isSplitInput(job.getConfiguration());
//if delimSplit is true, size of each split is determined by
//Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat
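        // For instance, with stock Hadoop defaults (128 MB HDFS blocks and no
        // explicit mapreduce.input.fileinputformat.split.minsize/maxsize),
        // each split covers one block; that is an assumption about typical
        // cluster settings, not something this class enforces.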
        List<InputSplit> splits = super.getSplits(job);
if (!delimSplit) {
return splits;
}
        if (splits.size() >= SPLIT_COUNT_LIMIT) {
            // With SPLIT_COUNT_LIMIT (1 million) or more splits there is
            // already enough parallelism, so there is no point in splitting
            // further.
            LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off:"
                + SPLIT_COUNT_LIMIT);
            DefaultStringifier.store(job.getConfiguration(), false,
                ConfigConstants.CONF_SPLIT_INPUT);
return splits;
}
// add header info into splits
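        // Each DelimitedSplit carries a copy of the header so that a reader
        // assigned a split starting mid-file can name the record fields
        // without seeking back to the beginning of the file.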
        List<InputSplit> populatedSplits = new ArrayList<>();
LOG.info(splits.size() + " DelimitedSplits generated");
Configuration conf = job.getConfiguration();
        char delimiter = 0;
        ArrayList<Text> hlist = new ArrayList<>();
for (InputSplit file: splits) {
FileSplit fsplit = ((FileSplit)file);
Path path = fsplit.getPath();
FileSystem fs = path.getFileSystem(conf);
if (fsplit.getStart() == 0) {
// parse the inSplit, get the header
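                // Only the split that starts at byte 0 of a file carries the
                // header line; the header parsed here is reused for the
                // file's remaining splits, relying on super.getSplits()
                // returning each file's splits in order.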
FSDataInputStream fileIn = fs.open(path);
String delimStr = conf.get(ConfigConstants.CONF_DELIMITER,
ConfigConstants.DEFAULT_DELIMITER);
if (delimStr.length() == 1) {
delimiter = delimStr.charAt(0);
} else {
LOG.error("Incorrect delimitor: " + delimiter
+ ". Expects single character.");
}
String encoding = conf.get(
MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING);
InputStreamReader instream = new InputStreamReader(fileIn, encoding);
CSVParser parser = new CSVParser(instream, CSVParserFormatter.
getFormat(delimiter, DelimitedTextReader.encapsulator,
true, true));
                Iterator<CSVRecord> it = parser.iterator();
String[] header = null;
if (it.hasNext()) {
                    CSVRecord record = it.next();
                    Iterator<String> recordIterator = record.iterator();
int recordSize = record.size();
header = new String[recordSize];
for (int i = 0; i < recordSize; i++) {
if (recordIterator.hasNext()) {
                            header[i] = recordIterator.next();
} else {
throw new IOException("Record size doesn't match the real size");
}
}
EncodingUtil.handleBOMUTF8(header, 0);
hlist.clear();
for (String s : header) {
hlist.add(new Text(s));
}
}
instream.close();
}
DelimitedSplit ds = new DelimitedSplit(new TextArrayWritable(
hlist.toArray(new Text[hlist.size()])), path,
fsplit.getStart(), fsplit.getLength(),
fsplit.getLocations());
populatedSplits.add(ds);
}
return populatedSplits;
}
}
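
/*
 * Minimal usage sketch (illustrative only; mlcp's import command normally
 * configures this internally, and the job name and input path below are
 * made-up examples rather than values from this codebase):
 *
 *   Configuration conf = new Configuration();
 *   conf.set(ConfigConstants.CONF_DELIMITER, ",");
 *   conf.setBoolean(ConfigConstants.CONF_SPLIT_INPUT, true);
 *   Job job = Job.getInstance(conf, "delimited-text-import");
 *   job.setInputFormatClass(DelimitedTextInputFormat.class);
 *   FileInputFormat.addInputPath(job, new Path("/data/sample.csv"));
 */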