org.apache.phoenix.mapreduce.CsvToKeyValueMapper

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.phoenix.mapreduce;

import java.io.IOException;
import java.io.StringReader;
import java.util.List;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.phoenix.util.CSVCommonsLoader;
import org.apache.phoenix.util.ColumnInfo;
import org.apache.phoenix.util.UpsertExecutor;
import org.apache.phoenix.util.csv.CsvUpsertExecutor;

import org.apache.phoenix.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.phoenix.thirdparty.com.google.common.base.Preconditions;
import org.apache.phoenix.thirdparty.com.google.common.collect.Iterables;

/**
 * MapReduce mapper that converts CSV input lines into KeyValues that can be written to HFiles.
 * 
 * KeyValues are produced by executing UPSERT statements on a Phoenix connection and then
 * extracting the created KeyValues and rolling back the statement execution before it is
 * committed to HBase.
 */
public class CsvToKeyValueMapper extends FormatToBytesWritableMapper<CSVRecord> {

    /** Configuration key for the field delimiter for input csv records */
    public static final String FIELD_DELIMITER_CONFKEY = "phoenix.mapreduce.import.fielddelimiter";

    /** Configuration key for the quote char for input csv records */
    public static final String QUOTE_CHAR_CONFKEY = "phoenix.mapreduce.import.quotechar";

    /** Configuration key for the escape char for input csv records */
    public static final String ESCAPE_CHAR_CONFKEY = "phoenix.mapreduce.import.escapechar";

    /** Configuration key for the array element delimiter for input arrays */
    public static final String ARRAY_DELIMITER_CONFKEY = "phoenix.mapreduce.import.arraydelimiter";

    private CsvLineParser lineParser;

    @Override
    protected LineParser<CSVRecord> getLineParser() {
        return lineParser;
    }

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
        Configuration conf = context.getConfiguration();
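        // Build the CSV line parser from the delimiter, quote, and escape characters stored in the job configuration.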
        lineParser = new CsvLineParser(
                CsvBulkImportUtil.getCharacter(conf, FIELD_DELIMITER_CONFKEY),
                CsvBulkImportUtil.getCharacter(conf, QUOTE_CHAR_CONFKEY),
                CsvBulkImportUtil.getCharacter(conf, ESCAPE_CHAR_CONFKEY));
    }

    @VisibleForTesting
    @Override
    protected UpsertExecutor<CSVRecord, ?> buildUpsertExecutor(Configuration conf) {
        String tableName = conf.get(TABLE_NAME_CONFKEY);
        String arraySeparator = conf.get(ARRAY_DELIMITER_CONFKEY,
                                            CSVCommonsLoader.DEFAULT_ARRAY_ELEMENT_SEPARATOR);
        Preconditions.checkNotNull(tableName, "table name is not configured");

        List<ColumnInfo> columnInfoList = buildColumnInfoList(conf);

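        // conn and upsertListener are inherited from FormatToBytesWritableMapper; the executor turns
        // each CSVRecord into an UPSERT against the Phoenix connection so the parent mapper can
        // extract the resulting KeyValues.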
        return new CsvUpsertExecutor(conn, tableName, columnInfoList, upsertListener, arraySeparator);
    }

    /**
     * Parses a single CSV input line, returning a {@code CSVRecord}.
     */
    @VisibleForTesting
    static class CsvLineParser implements LineParser<CSVRecord> {
        private final CSVFormat csvFormat;

        CsvLineParser(Character fieldDelimiter, Character quote, Character escape) {
            this.csvFormat = CSVFormat.DEFAULT
                    .withIgnoreEmptyLines(true)
                    .withDelimiter(fieldDelimiter)
                    .withEscape(escape)
                    .withQuote(quote);
        }

        @Override
        public CSVRecord parse(String input) throws IOException {
            // TODO Creating a new parser for each line seems terribly inefficient but
            // there's no public way to parse single lines via commons-csv. We should update
            // it to create a LineParser class like this one.
            CSVParser csvParser = new CSVParser(new StringReader(input), csvFormat);
            return Iterables.getFirst(csvParser, null);
        }
    }
}
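
For reference, here is a minimal sketch of exercising CsvLineParser directly. The example class name is hypothetical and not part of Phoenix; because CsvLineParser is package-private, the snippet assumes it is compiled in the same org.apache.phoenix.mapreduce package with phoenix-core and commons-csv on the classpath.

package org.apache.phoenix.mapreduce;

import java.io.IOException;

import org.apache.commons.csv.CSVRecord;

// Hypothetical example class, not part of the Phoenix codebase.
public class CsvLineParserExample {
    public static void main(String[] args) throws IOException {
        // Pipe-delimited input with double-quote quoting and backslash escaping.
        CsvToKeyValueMapper.CsvLineParser parser =
                new CsvToKeyValueMapper.CsvLineParser('|', '"', '\\');
        CSVRecord record = parser.parse("1|\"Hello, world\"|2024-01-01");
        System.out.println(record.get(0)); // prints: 1
        System.out.println(record.get(1)); // prints: Hello, world
        System.out.println(record.get(2)); // prints: 2024-01-01
    }
}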