org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io.orc;

import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;

import org.apache.orc.CompressionKind;
import org.apache.orc.TypeDescription;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.AcidOutputFormat;
import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.IOConstants;
import org.apache.hadoop.hive.ql.io.RecordUpdater;
import org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde.OrcSerdeRow;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;

/**
 * A Hive OutputFormat for ORC files.
 */
public class OrcOutputFormat extends FileOutputFormat<NullWritable, OrcSerdeRow>
                        implements AcidOutputFormat<NullWritable, OrcSerdeRow> {

  private static final Logger LOG = LoggerFactory.getLogger(OrcOutputFormat.class);

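  /**
   * Writes OrcSerdeRow records to a single ORC file. The underlying ORC
   * Writer is created lazily on the first write so that the row's
   * ObjectInspector is known before the file is opened; closing with no
   * rows written creates an empty placeholder file instead.
   */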
  private static class OrcRecordWriter
      implements RecordWriter<NullWritable, OrcSerdeRow>,
                 StatsProvidingRecordWriter {
    private Writer writer = null;
    private final Path path;
    private final OrcFile.WriterOptions options;
    private final SerDeStats stats;

    OrcRecordWriter(Path path, OrcFile.WriterOptions options) {
      this.path = path;
      this.options = options;
      this.stats = new SerDeStats();
    }

    @Override
    public void write(NullWritable nullWritable,
                      OrcSerdeRow row) throws IOException {
      if (writer == null) {
        options.inspector(row.getInspector());
        writer = OrcFile.createWriter(path, options);
      }
      writer.addRow(row.getRow());
    }

    @Override
    public void write(Writable row) throws IOException {
      OrcSerdeRow serdeRow = (OrcSerdeRow) row;
      if (writer == null) {
        options.inspector(serdeRow.getInspector());
        writer = OrcFile.createWriter(path, options);
      }
      writer.addRow(serdeRow.getRow());
    }

    @Override
    public void close(Reporter reporter) throws IOException {
      close(true);
    }

    @Override
    public void close(boolean b) throws IOException {
      if (writer == null) {
        // we are closing a file without writing any data in it
        FileSystem fs = options.getFileSystem() == null ?
            path.getFileSystem(options.getConfiguration()) : options.getFileSystem();
        fs.createNewFile(path);
        return;
      }
      writer.close();
    }

    @Override
    public SerDeStats getStats() {
      stats.setRawDataSize(null == writer ? 0 : writer.getRawDataSize());
      stats.setRowCount(null == writer ? 0 : writer.getNumberOfRows());
      return stats;
    }
  }

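  /**
   * Builds writer options from the job configuration and table properties.
   * When both the column-name and column-type properties are present and
   * non-empty, the ORC struct schema is derived from them and attached to
   * the options.
   */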
  private OrcFile.WriterOptions getOptions(JobConf conf, Properties props) {
    OrcFile.WriterOptions result = OrcFile.writerOptions(props, conf);
    if (props != null) {
      final String columnNameProperty =
          props.getProperty(IOConstants.COLUMNS);
      final String columnTypeProperty =
          props.getProperty(IOConstants.COLUMNS_TYPES);
      if (columnNameProperty != null &&
          !columnNameProperty.isEmpty() &&
          columnTypeProperty != null &&
          !columnTypeProperty.isEmpty()) {
        List<String> columnNames;
        List<TypeInfo> columnTypes;
        final String columnNameDelimiter = props.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? props
            .getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA);
        if (columnNameProperty.length() == 0) {
          columnNames = new ArrayList<String>();
        } else {
          columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter));
        }

        if (columnTypeProperty.length() == 0) {
          columnTypes = new ArrayList<TypeInfo>();
        } else {
          columnTypes =
              TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
        }

        TypeDescription schema = TypeDescription.createStruct();
        for (int i = 0; i < columnNames.size(); ++i) {
          schema.addField(columnNames.get(i),
              OrcInputFormat.convertTypeInfo(columnTypes.get(i)));
        }
        if (LOG.isDebugEnabled()) {
          LOG.debug("ORC schema = " + schema);
        }
        result.setSchema(schema);
      }
    }
    return result;
  }

  @Override
  public RecordWriter<NullWritable, OrcSerdeRow>
  getRecordWriter(FileSystem fileSystem, JobConf conf, String name,
                  Progressable reporter) throws IOException {
    return new
        OrcRecordWriter(new Path(name), getOptions(conf, null));
  }


  @Override
  public StatsProvidingRecordWriter
     getHiveRecordWriter(JobConf conf,
                         Path path,
                         Class<? extends Writable> valueClass,
                         boolean isCompressed,
                         Properties tableProperties,
                         Progressable reporter) throws IOException {
    return new OrcRecordWriter(path, getOptions(conf, tableProperties));
  }

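  /**
   * A RecordUpdater that prints each ACID event (insert/update/delete,
   * flush, close) to the dummy stream supplied in the options rather than
   * writing any ORC data.
   */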
  private class DummyOrcRecordUpdater implements RecordUpdater {
    private final Path path;
    private final ObjectInspector inspector;
    private final PrintStream out;

    private DummyOrcRecordUpdater(Path path, Options options) {
      this.path = path;
      this.inspector = options.getInspector();
      this.out = options.getDummyStream();
    }

    @Override
    public void insert(long currentWriteId, Object row) throws IOException {
      out.println("insert " + path + " currWriteId: " + currentWriteId +
          " obj: " + stringifyObject(row, inspector));
    }

    @Override
    public void update(long currentWriteId, Object row) throws IOException {
      out.println("update " + path + " currWriteId: " + currentWriteId +
          " obj: " + stringifyObject(row, inspector));
    }

    @Override
    public void delete(long currentWriteId, Object row) throws IOException {
      out.println("delete " + path + " currWriteId: " + currentWriteId + " obj: " + row);
    }

    @Override
    public void flush() throws IOException {
      out.println("flush " + path);
    }

    @Override
    public void close(boolean abort) throws IOException {
      out.println("close " + path);
    }

    @Override
    public SerDeStats getStats() {
      return null;
    }

    @Override
    public long getBufferedRowCount() {
      return 0;
    }

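    // Renders a value through its ObjectInspector: structs print as
    // "{ field: value, ... }", primitives print their Java value, and
    // anything else prints as "*unknown*".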
    private void stringifyObject(StringBuilder buffer,
                                 Object obj,
                                 ObjectInspector inspector
                                ) throws IOException {
      if (inspector instanceof StructObjectInspector) {
        buffer.append("{ ");
        StructObjectInspector soi = (StructObjectInspector) inspector;
        boolean isFirst = true;
        for(StructField field: soi.getAllStructFieldRefs()) {
          if (isFirst) {
            isFirst = false;
          } else {
            buffer.append(", ");
          }
          buffer.append(field.getFieldName());
          buffer.append(": ");
          stringifyObject(buffer, soi.getStructFieldData(obj, field),
              field.getFieldObjectInspector());
        }
        buffer.append(" }");
      } else if (inspector instanceof PrimitiveObjectInspector) {
        PrimitiveObjectInspector poi = (PrimitiveObjectInspector) inspector;
        buffer.append(poi.getPrimitiveJavaObject(obj).toString());
      } else {
        buffer.append("*unknown*");
      }
    }

    private String stringifyObject(Object obj,
                                   ObjectInspector inspector
                                  ) throws IOException {
      StringBuilder buffer = new StringBuilder();
      stringifyObject(buffer, obj, inspector);
      return buffer.toString();
    }
  }

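  /**
   * Returns the ACID record updater for a path; when the options carry a
   * dummy stream, the printing DummyOrcRecordUpdater above is substituted
   * for the real OrcRecordUpdater.
   */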
  @Override
  public RecordUpdater getRecordUpdater(Path path,
                                        Options options) throws IOException {
    if (options.getDummyStream() != null) {
      return new DummyOrcRecordUpdater(path, options);
    } else {
      return new OrcRecordUpdater(path, options);
    }
  }

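  /**
   * Raw writer used by compaction: each incoming row is a complete ACID
   * event (an OrcStruct) whose key fields feed the key index builder, and
   * delta files are written uncompressed with small stripes and no row
   * index.
   */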
  @Override
  public org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter
        getRawRecordWriter(Path path, Options options) throws IOException {
    final Path filename = AcidUtils.createFilename(path, options);
    final OrcFile.WriterOptions opts =
        OrcFile.writerOptions(options.getTableProperties(), options.getConfiguration());
    if (!options.isWritingBase()) {
      opts.bufferSize(OrcRecordUpdater.DELTA_BUFFER_SIZE)
          .stripeSize(OrcRecordUpdater.DELTA_STRIPE_SIZE)
          .blockPadding(false)
          .compress(CompressionKind.NONE)
          .rowIndexStride(0);
    }
    final OrcRecordUpdater.KeyIndexBuilder watcher =
        new OrcRecordUpdater.KeyIndexBuilder("compactor");
    opts.inspector(options.getInspector())
        .callback(watcher);
    final Writer writer = OrcFile.createWriter(filename, opts);
    AcidUtils.OrcAcidVersion.setAcidVersionInDataFile(writer);
    return new org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter() {
      @Override
      public void write(Writable w) throws IOException {
        OrcStruct orc = (OrcStruct) w;
        watcher.addKey(
            ((IntWritable) orc.getFieldValue(OrcRecordUpdater.OPERATION)).get(),
            ((LongWritable)
                orc.getFieldValue(OrcRecordUpdater.ORIGINAL_WRITEID)).get(),
            ((IntWritable) orc.getFieldValue(OrcRecordUpdater.BUCKET)).get(),
            ((LongWritable) orc.getFieldValue(OrcRecordUpdater.ROW_ID)).get());
        writer.addRow(w);
      }

      @Override
      public void close(boolean abort) throws IOException {
        writer.close();
      }
    };
  }
}
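
For orientation, here is a minimal sketch of driving this output format
directly rather than through a Hive query plan. The row class, output path,
and property values are illustrative assumptions, as are the extra imports it
needs (ObjectInspectorFactory, FileSinkOperator); in a real job, Hive's
FileSinkOperator supplies all of this from the table definition.

    // Hypothetical standalone usage; not part of the class above.
    static class MyRow {
      int x;
      int y;
      MyRow(int x, int y) { this.x = x; this.y = y; }
    }

    void writeExample() throws Exception {
      JobConf conf = new JobConf();
      Properties props = new Properties();
      props.setProperty(IOConstants.COLUMNS, "x,y");
      props.setProperty(IOConstants.COLUMNS_TYPES, "int:int");

      // OrcSerde wraps Java rows into the OrcSerdeRow records the writer expects.
      OrcSerde serde = new OrcSerde();
      serde.initialize(conf, props);
      ObjectInspector inspector =
          ObjectInspectorFactory.getReflectionObjectInspector(
              MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);

      FileSinkOperator.RecordWriter writer = new OrcOutputFormat()
          .getHiveRecordWriter(conf, new Path("/tmp/example.orc"), Writable.class,
              true, props, Reporter.NULL);
      writer.write(serde.serialize(new MyRow(1, 2), inspector));
      writer.close(false);
    }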