// org.apache.phoenix.mapreduce.FormatToBytesWritableMapper Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of phoenix-client-hbase-1.6
// Phoenix Client
// The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.phoenix.mapreduce;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.TreeMap;

import javax.annotation.Nullable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.phoenix.jdbc.PhoenixConnection;
import org.apache.phoenix.mapreduce.bulkload.TableRowkeyPair;
import org.apache.phoenix.mapreduce.bulkload.TargetTableRefFunctions;
import org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil;
import org.apache.phoenix.query.QueryConstants;
import org.apache.phoenix.schema.PColumn;
import org.apache.phoenix.schema.PColumnFamily;
import org.apache.phoenix.schema.PTable;
import org.apache.phoenix.schema.PTable.ImmutableStorageScheme;
import org.apache.phoenix.util.ColumnInfo;
import org.apache.phoenix.util.EncodedColumnsUtil;
import org.apache.phoenix.util.PhoenixRuntime;
import org.apache.phoenix.util.QueryUtil;
import org.apache.phoenix.util.SchemaUtil;
import org.apache.phoenix.util.UpsertExecutor;
import org.apache.phoenix.hbase.index.IndexRegionObserver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;

/**
 * Base class for converting some input source format into {@link ImmutableBytesWritable}s, each
 * of which contains the values for all columns of a row packed into a single byte array.
 * Assumes the input format is text-based, with one row per line. Depends on an online cluster
 * to retrieve {@link ColumnInfo} from the target table.
 */
public abstract class FormatToBytesWritableMapper extends Mapper {

    /** Logger for this mapper; protected so that subclasses can log through it as well. */
    protected static final Logger LOGGER =
            LoggerFactory.getLogger(FormatToBytesWritableMapper.class);

    /** Name of the job counter group under which this mapper reports its counters. */
    protected static final String COUNTER_GROUP_NAME = "Phoenix MapReduce Import";

    /** Configuration key for the name of the output table */
    public static final String TABLE_NAME_CONFKEY = "phoenix.mapreduce.import.tablename";

    /**
     * Configuration key for the columns to be imported. The value is written as the
     * {@code |}-joined string form of the column infos, where an empty entry stands for an
     * input column that is to be skipped.
     */
    public static final String COLUMN_INFO_CONFKEY = "phoenix.mapreduce.import.columninfos";

    /** Configuration key for the flag to ignore invalid rows; read with a default of {@code true} */
    public static final String IGNORE_INVALID_ROW_CONFKEY = "phoenix.mapreduce.import.ignoreinvalidrow";

    /**
     * Configuration key for the table names; the value is decoded with
     * {@code TargetTableRefFunctions.NAMES_FROM_JSON}
     */
    public static final String TABLE_NAMES_CONFKEY = "phoenix.mapreduce.import.tablenames";

    /**
     * Configuration key for the table logical names; the value is decoded with
     * {@code TargetTableRefFunctions.NAMES_FROM_JSON}
     */
    public static final String LOGICAL_NAMES_CONFKEY = "phoenix.mapreduce.import.logicalnames";

    /**
     * Parses a single input line, returning a {@code T}.
     *
     * @param <T> type of the record produced for one line of input
     */
    public interface LineParser<T> {
        /**
         * Parses one line of input.
         *
         * @param input the text of a single input line
         * @return the parsed record; the mapper counts a {@code null} result as an empty record
         * @throws IOException if the line cannot be parsed; the mapper counts this as a
         *                     parser error and skips the line
         */
        T parse(String input) throws IOException;
    }

    /** Phoenix connection opened in {@code setup} and closed in {@code cleanup}. */
    protected PhoenixConnection conn;
    /** Executor invoked for every parsed record; created by {@link #buildUpsertExecutor}. */
    protected UpsertExecutor<RECORD, ?> upsertExecutor;
    /** Hook applied to the KeyValues of every uncommitted row before they are aggregated. */
    protected ImportPreUpsertKeyValueProcessor preUpdateProcessor;
    /**
     * One slot per entry of {@link #logicalNames}; non-null only for global index tables,
     * {@code null} otherwise.
     */
    protected IndexStatusUpdater[] indexStatusUpdaters;
    /** Table names decoded from {@link #TABLE_NAMES_CONFKEY}. */
    protected List<String> tableNames;
    /** Logical table names decoded from {@link #LOGICAL_NAMES_CONFKEY}. */
    protected List<String> logicalNames;
    /** Listener that reports upsert results to the job counters. */
    protected MapperUpsertListener<RECORD> upsertListener;

    /*
    Lookup table for column index: maps the bytes of <family><separator><qualifier> to the
    numeric index that replaces the column name in the aggregated row.
     */
    protected Map<byte[], Integer> columnIndexes;

    /**
     * Builds the executor that performs the upsert for each parsed record.
     *
     * @param conf the job configuration
     * @return the executor to use for this mapper
     */
    protected abstract UpsertExecutor<RECORD, ?> buildUpsertExecutor(Configuration conf);

    /**
     * Returns the parser that converts one line of input text into a record.
     *
     * @return the line parser for this mapper
     */
    protected abstract LineParser<RECORD> getLineParser();

    /**
     * Prepares the mapper: opens the Phoenix connection, decodes the target table names,
     * builds the column index lookup and creates the upsert listener, the upsert executor
     * and the pre-upsert processor.
     *
     * @param context the mapper context supplying the job configuration
     * @throws RuntimeException wrapping any {@link SQLException} raised while connecting or
     *                          while reading table metadata
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {

        Configuration conf = context.getConfiguration();

        // pass client configuration into driver
        Properties clientInfos = new Properties();
        for (Map.Entry<String, String> entry : conf) {
            clientInfos.setProperty(entry.getKey(), entry.getValue());
        }

        try {
            conn = (PhoenixConnection) QueryUtil.getConnectionOnServer(clientInfos, conf);
            // We are dependent on rolling back before performing commits, so we need to be sure
            // that auto-commit is not turned on
            conn.setAutoCommit(false);

            final String tableNamesConf = conf.get(TABLE_NAMES_CONFKEY);
            final String logicalNamesConf = conf.get(LOGICAL_NAMES_CONFKEY);
            tableNames = TargetTableRefFunctions.NAMES_FROM_JSON.apply(tableNamesConf);
            logicalNames = TargetTableRefFunctions.NAMES_FROM_JSON.apply(logicalNamesConf);

            // Needs both the open connection and logicalNames to be in place
            initColumnIndexes();
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }

        // Invalid rows are ignored unless the job explicitly turns the flag off
        upsertListener = new MapperUpsertListener<RECORD>(
                context, conf.getBoolean(IGNORE_INVALID_ROW_CONFKEY, true));
        upsertExecutor = buildUpsertExecutor(conf);
        preUpdateProcessor = PhoenixConfigurationUtil.loadPreUpsertProcessor(conf);
    }

    @SuppressWarnings("deprecation")
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException,
            InterruptedException {
        if (conn == null) {
            throw new RuntimeException("Connection not initialized.");
        }
        try {
            RECORD record = null;
            try {
                record = getLineParser().parse(value.toString());
            } catch (IOException e) {
                context.getCounter(COUNTER_GROUP_NAME, "Parser errors").increment(1L);
                return;
            }

            if (record == null) {
                context.getCounter(COUNTER_GROUP_NAME, "Empty records").increment(1L);
                return;
            }
            upsertExecutor.execute(ImmutableList.of(record));
            Map> map = new HashMap<>();
            Iterator>> uncommittedDataIterator
                    = PhoenixRuntime.getUncommittedDataIterator(conn, true);
            while (uncommittedDataIterator.hasNext()) {
                Pair> kvPair = uncommittedDataIterator.next();
                List keyValueList = kvPair.getSecond();
                byte[] tableName = kvPair.getFirst();
                keyValueList = preUpdateProcessor.preUpsert(tableName, keyValueList);
                // Create a list of KV for each table
                for (int i = 0; i < tableNames.size(); i++) {
                    if (Bytes.compareTo(Bytes.toBytes(tableNames.get(i)), tableName) == 0) {
                        if (!map.containsKey(i)) {
                            map.put(i, new ArrayList());
                        }
                        List cellsForTable = map.get(i);
                        if (indexStatusUpdaters[i] != null) {
                            indexStatusUpdaters[i].setVerfied(keyValueList);
                        }
                        cellsForTable.addAll(keyValueList);
                        break;
                    }
                }
            }
            for (Map.Entry> rowEntry : map.entrySet()) {
                int tableIndex = rowEntry.getKey();
                List lkv = rowEntry.getValue();
                // All KV values combines to a single byte array
                writeAggregatedRow(context, tableNames.get(tableIndex), lkv);
            }
            conn.rollback();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /*
    Map all unique pairs <family, name> to index. Table name is part of TableRowkey, so we do
    not care about it.

    For every logical table this also records, in indexStatusUpdaters, an updater for global
    index tables (the slot stays null for every other table).

    NOTE(review): findIndex's documentation says the index is "restored in Reducer", so the
    reducer presumably rebuilds the same numbering; the order in which indexes are handed
    out here is deliberately left untouched - confirm before changing it.
     */
    private void initColumnIndexes() throws SQLException {
        columnIndexes = new TreeMap<>(Bytes.BYTES_COMPARATOR);
        indexStatusUpdaters = new IndexStatusUpdater[logicalNames.size()];
        int columnIndex = 0;
        for (int index = 0; index < logicalNames.size(); index++) {
            PTable table = PhoenixRuntime.getTable(conn, logicalNames.get(index));
            if (!table.getImmutableStorageScheme().equals(ImmutableStorageScheme.ONE_CELL_PER_COLUMN)) {
                // One entry per column family, keyed by the single-key-value qualifier
                List<PColumnFamily> cfs = table.getColumnFamilies();
                for (int i = 0; i < cfs.size(); i++) {
                    byte[] family = cfs.get(i).getName().getBytes();
                    byte[] cfn = Bytes.add(family, QueryConstants.NAMESPACE_SEPARATOR_BYTES,
                            QueryConstants.SINGLE_KEYVALUE_COLUMN_QUALIFIER_BYTES);
                    columnIndexes.put(cfn, Integer.valueOf(columnIndex));
                    columnIndex++;
                }
            } else {
                // One entry per column; PK columns have no family and are keyed by name
                List<PColumn> cls = table.getColumns();
                for (int i = 0; i < cls.size(); i++) {
                    PColumn c = cls.get(i);
                    byte[] family = new byte[0];
                    byte[] cq;
                    if (!SchemaUtil.isPKColumn(c)) {
                        family = c.getFamilyName().getBytes();
                        cq = c.getColumnQualifierBytes();
                    } else {
                        cq = c.getName().getBytes();
                    }
                    byte[] cfn = Bytes.add(family, QueryConstants.NAMESPACE_SEPARATOR_BYTES, cq);
                    if (!columnIndexes.containsKey(cfn)) {
                        columnIndexes.put(cfn, Integer.valueOf(columnIndex));
                        columnIndex++;
                    }
                }
            }
            // The empty key value of the table always gets an index of its own
            byte[] emptyColumnFamily = SchemaUtil.getEmptyColumnFamily(table);
            byte[] emptyKeyValue = EncodedColumnsUtil.getEmptyKeyValueInfo(table).getFirst();
            byte[] cfn = Bytes.add(emptyColumnFamily, QueryConstants.NAMESPACE_SEPARATOR_BYTES, emptyKeyValue);
            columnIndexes.put(cfn, Integer.valueOf(columnIndex));
            columnIndex++;
            if (PTable.IndexType.GLOBAL == table.getIndexType()) {
                indexStatusUpdaters[index] =
                        new IndexStatusUpdater(emptyColumnFamily, emptyKeyValue);
            }
        }
    }

    /**
     * Looks up the numeric index that stands in for the cell's column name inside the
     * aggregated array (the name is restored from this index in the Reducer).
     *
     * @param cell cell whose family and qualifier identify the column
     * @return column index for the specified cell or -1 if was not found
     */
    private int findIndex(Cell cell) throws IOException {
        byte[] family = Bytes.copy(cell.getFamilyArray(), cell.getFamilyOffset(),
                cell.getFamilyLength());
        byte[] qualifier = Bytes.copy(cell.getQualifierArray(), cell.getQualifierOffset(),
                cell.getQualifierLength());
        // Same <family><separator><qualifier> key layout that the lookup table is built with
        byte[] lookupKey =
                Bytes.add(family, QueryConstants.NAMESPACE_SEPARATOR_BYTES, qualifier);
        Integer columnIndex = columnIndexes.get(lookupKey);
        return columnIndex == null ? -1 : columnIndex;
    }

    /**
     * Collect all column values for the same Row. RowKey may be different if indexes are involved,
     * so it writes a separate record for each unique RowKey. A new record is started whenever
     * the row key differs from that of the previous cell. Nothing is written for an empty list.
     *
     * @param context    Current mapper context
     * @param tableName  Name of the target table, used in the output key
     * @param lkv        List of KV values that will be combined in a single ImmutableBytesWritable
     * @throws IOException          if serializing a cell or writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    private void writeAggregatedRow(Context context, String tableName, List<KeyValue> lkv)
            throws IOException, InterruptedException {
        if (lkv.isEmpty()) {
            return;
        }
        ByteArrayOutputStream bos = null;
        DataOutputStream outputStream = null;
        ImmutableBytesWritable outputKey = null;
        for (KeyValue cell : lkv) {
            boolean sameRowAsPrevious = outputKey != null
                    && Bytes.compareTo(outputKey.get(), outputKey.getOffset(),
                            outputKey.getLength(), cell.getRowArray(), cell.getRowOffset(),
                            cell.getRowLength()) == 0;
            if (!sameRowAsPrevious) {
                // This is the first RowKey or a different one from the previous
                if (outputKey != null) { // It's a different RowKey, so we need to write it
                    emitAggregatedRow(context, tableName, outputKey, bos, outputStream);
                }
                outputKey = new ImmutableBytesWritable(cell.getRowArray(), cell.getRowOffset(),
                        cell.getRowLength());
                bos = new ByteArrayOutputStream(1024);
                outputStream = new DataOutputStream(bos);
            }
            int i = findIndex(cell);
            if (i == -1) {
                // That may happen when we load only local indexes. Since KV pairs for both
                // table and local index are going to the same physical table at that point
                // we skip those KVs that do not belong to the local index
                continue;
            }
            /*
            The order of aggregation: type, timestamp, index of column, length of value,
            value itself
             */
            outputStream.writeByte(cell.getTypeByte());
            WritableUtils.writeVLong(outputStream, cell.getTimestamp());
            WritableUtils.writeVInt(outputStream, i);
            WritableUtils.writeVInt(outputStream, cell.getValueLength());
            outputStream.write(cell.getValueArray(), cell.getValueOffset(), cell.getValueLength());
        }
        // The list is not empty, so the last row is always still pending here
        emitAggregatedRow(context, tableName, outputKey, bos, outputStream);
    }

    /** Writes the bytes buffered for one row key to the context and closes the stream. */
    private void emitAggregatedRow(Context context, String tableName,
            ImmutableBytesWritable rowKey, ByteArrayOutputStream buffer, DataOutputStream stream)
            throws IOException, InterruptedException {
        ImmutableBytesWritable aggregatedArray = new ImmutableBytesWritable(buffer.toByteArray());
        stream.close();
        context.write(new TableRowkeyPair(tableName, rowKey), aggregatedArray);
    }

    /**
     * Closes the Phoenix connection, if one was opened.
     *
     * @param context the mapper context (not used)
     * @throws RuntimeException wrapping any {@link SQLException} raised while closing
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        if (conn == null) {
            // Nothing was opened, so there is nothing to release
            return;
        }
        try {
            conn.close();
        } catch (SQLException closeFailure) {
            throw new RuntimeException(closeFailure);
        }
    }

    /**
     * Write the list of to-import columns to a job configuration.
     * <p>
     * The entries are joined with {@code |} under {@link #COLUMN_INFO_CONFKEY}; a
     * {@code null} entry is written as an empty string.
     *
     * @param conf           configuration to be written to
     * @param columnInfoList list of ColumnInfo objects to be configured for import
     */
    @VisibleForTesting
    static void configureColumnInfoList(Configuration conf, List<ColumnInfo> columnInfoList) {
        conf.set(COLUMN_INFO_CONFKEY, Joiner.on("|").useForNull("").join(columnInfoList));
    }

    /**
     * Build the list of ColumnInfos for the import based on information in the configuration.
     * <p>
     * The value stored under {@link #COLUMN_INFO_CONFKEY} is split on {@code |} and every
     * non-empty piece is converted with {@code ColumnInfo.fromString}.
     *
     * @param conf configuration holding the column info string
     * @return a new list with one entry per piece; an entry is {@code null} where the
     *         corresponding piece is empty
     */
    @VisibleForTesting
    static List<ColumnInfo> buildColumnInfoList(Configuration conf) {

        return Lists.newArrayList(
                Iterables.transform(
                        Splitter.on("|").split(conf.get(COLUMN_INFO_CONFKEY)),
                        new Function<String, ColumnInfo>() {
                            @Nullable
                            @Override
                            public ColumnInfo apply(@Nullable String input) {
                                if (input == null || input.isEmpty()) {
                                    // An empty string represents a null that was passed in to
                                    // the configuration, which corresponds to an input column
                                    // which is to be skipped
                                    return null;
                                }
                                return ColumnInfo.fromString(input);
                            }
                        }));
    }

    /**
     * Listener that logs successful upserts and errors to job counters.
     *
     * @param <T> type of the record being upserted
     */
    @VisibleForTesting
    static class MapperUpsertListener<T> implements UpsertExecutor.UpsertListener<T> {

        private final Mapper<LongWritable, Text,
                TableRowkeyPair, ImmutableBytesWritable>.Context context;
        private final boolean ignoreRecordErrors;

        /**
         * @param context            mapper context whose counters are updated
         * @param ignoreRecordErrors when {@code false}, an error on a record is rethrown
         *                           after it has been logged and counted
         */
        private MapperUpsertListener(
                Mapper<LongWritable, Text, TableRowkeyPair, ImmutableBytesWritable>.Context context,
                boolean ignoreRecordErrors) {
            this.context = context;
            this.ignoreRecordErrors = ignoreRecordErrors;
        }

        /**
         * Counts one completed upsert. The counter goes up by one per call; the
         * {@code upsertCount} argument is not used.
         */
        @Override
        public void upsertDone(long upsertCount) {
            context.getCounter(COUNTER_GROUP_NAME, "Upserts Done").increment(1L);
        }

        /**
         * Logs and counts a failed record, then rethrows the failure unless record errors
         * are being ignored.
         */
        @Override
        public void errorOnRecord(T record, Throwable throwable) {
            LOGGER.error("Error on record " + record, throwable);
            context.getCounter(COUNTER_GROUP_NAME, "Errors on records").increment(1L);
            if (!ignoreRecordErrors) {
                Throwables.propagate(throwable);
            }
        }
    }

    /**
     * A default implementation of {@code ImportPreUpsertKeyValueProcessor} that is used if no
     * specific class is configured. This implementation simply passes through the KeyValue
     * list that is passed in.
     */
    public static class DefaultImportPreUpsertKeyValueProcessor implements
            ImportPreUpsertKeyValueProcessor {

        /**
         * Returns the given list unchanged.
         *
         * @param tableName name of the table the KeyValues belong to (not used)
         * @param keyValues KeyValues about to be written
         * @return the same {@code keyValues} instance
         */
        @Override
        public List<KeyValue> preUpsert(byte[] tableName, List<KeyValue> keyValues) {
            return keyValues;
        }
    }

    /**
     * Updates the EMPTY cell value to VERIFIED for global index table rows
     */
    private static class IndexStatusUpdater {

        private final byte[] emptyKeyValueCF;
        private final byte[] emptyKeyValueQualifier;

        /**
         * @param emptyKeyValueCF        column family of the table's empty key value
         * @param emptyKeyValueQualifier qualifier of the table's empty key value
         */
        public IndexStatusUpdater(final byte[] emptyKeyValueCF, final byte[] emptyKeyValueQualifier) {
            this.emptyKeyValueCF = emptyKeyValueCF;
            this.emptyKeyValueQualifier = emptyKeyValueQualifier;
        }

        /**
         * Update the Empty cell values to VERIFIED in the passed keyValues list. Cells whose
         * family and qualifier do not match the empty key value are left untouched.
         *
         * @param keyValues will be modified
         * @throws IllegalArgumentException if an empty cell's value is not exactly one byte
         */
        public void setVerfied(List<? extends Cell> keyValues) {
            for (Cell kv : keyValues) {
                boolean isEmptyCell =
                        CellUtil.matchingFamily(kv, emptyKeyValueCF, 0, emptyKeyValueCF.length)
                        && CellUtil.matchingQualifier(kv, emptyKeyValueQualifier, 0,
                                emptyKeyValueQualifier.length);
                if (!isEmptyCell) {
                    continue;
                }
                if (kv.getValueLength() != 1) {
                    //This should never happen. Fail fast if it does.
                    throw new IllegalArgumentException("Empty cell value length is not 1");
                }
                //We are directly overwriting the value for performance
                kv.getValueArray()[kv.getValueOffset()] = IndexRegionObserver.VERIFIED_BYTES[0];
            }
        }
    }
}