/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io.orc;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.orc.TypeDescription;
/**
* An ORC file writer. The file is divided into stripes, which is the natural
 * unit of work when reading. Each stripe is buffered in memory until the
 * buffered data reaches the stripe size, and then it is written out broken
 * down by columns. Each column is written by a TreeWriter that is specific to
 * that column's type. TreeWriters may have child TreeWriters that handle the
 * sub-types. Each of the TreeWriters writes the column's data as a set of
* streams.
*
 * This class is unsynchronized, like most Stream objects, so the creation of
 * an OrcFile and all access to a single instance must happen from a single
 * thread.
 *
 * There are no known cases today where these operations happen on different
 * threads.
 *
 * Caveat: the MemoryManager is created when the WriterOptions are created, so
 * it has to be confined to a single thread as well.
*
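 * <p>Usage sketch (a hypothetical caller, for illustration only; writers are
 * normally obtained through {@code OrcFile.createWriter(path, opts)} rather
 * than by constructing this class directly; {@code path}, {@code conf},
 * {@code inspector}, and {@code rows} are assumed to be supplied by the
 * caller):</p>
 * <pre>{@code
 *   Writer writer = OrcFile.createWriter(path,
 *       OrcFile.writerOptions(conf).inspector(inspector));
 *   for (Object row : rows) {   // rows must match the inspector's schema
 *     writer.addRow(row);       // buffered into an internal VectorizedRowBatch
 *   }
 *   writer.close();             // flushes the last partially filled batch
 * }</pre>
 *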
*/
public class WriterImpl extends org.apache.orc.impl.WriterImpl implements Writer {
private final ObjectInspector inspector;
private final VectorizedRowBatch internalBatch;
private final StructField[] fields;
WriterImpl(FileSystem fs,
Path path,
OrcFile.WriterOptions opts) throws IOException {
super(fs, path, opts);
this.inspector = opts.getInspector();
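    // A "decimal_64" value for the vectorization-support setting means the
    // batch should represent eligible decimal columns as Decimal64ColumnVector
    // (scaled longs) rather than the generic DecimalColumnVector.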
boolean useDecimal64ColumnVectors = opts.getConfiguration() != null &&
HiveConf.getVar(opts.getConfiguration(), HiveConf.ConfVars.HIVE_VECTORIZED_INPUT_FORMAT_SUPPORTS_ENABLED)
.equalsIgnoreCase("decimal_64");
if (useDecimal64ColumnVectors) {
this.internalBatch = opts.getSchema().createRowBatch(TypeDescription.RowBatchVersion.USE_DECIMAL64,
opts.getBatchSize());
} else {
this.internalBatch = opts.getSchema().createRowBatch(opts.getBatchSize());
}
this.fields = initializeFieldsFromOi(inspector);
}
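  // For struct schemas the top-level field references are cached so addRow()
  // can fan each row out across the batch's column vectors; for any other
  // schema, addRow() writes the whole value into column 0.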
private static StructField[] initializeFieldsFromOi(ObjectInspector inspector) {
if (inspector instanceof StructObjectInspector) {
      List<? extends StructField> fieldList =
          ((StructObjectInspector) inspector).getAllStructFieldRefs();
StructField[] fields = new StructField[fieldList.size()];
fieldList.toArray(fields);
return fields;
} else {
return null;
}
}
/**
 * Set the value for a given column of a given row within a batch.
* @param rowId the row to set
* @param column the column to set
* @param inspector the object inspector to interpret the obj
* @param obj the value to use
*/
static void setColumn(int rowId, ColumnVector column,
ObjectInspector inspector, Object obj) {
if (obj == null) {
column.noNulls = false;
column.isNull[rowId] = true;
} else {
switch (inspector.getCategory()) {
case PRIMITIVE:
switch (((PrimitiveObjectInspector) inspector)
.getPrimitiveCategory()) {
case BOOLEAN: {
LongColumnVector vector = (LongColumnVector) column;
vector.vector[rowId] =
((BooleanObjectInspector) inspector).get(obj) ? 1 : 0;
break;
}
case BYTE: {
LongColumnVector vector = (LongColumnVector) column;
vector.vector[rowId] = ((ByteObjectInspector) inspector).get(obj);
break;
}
case SHORT: {
LongColumnVector vector = (LongColumnVector) column;
vector.vector[rowId] =
((ShortObjectInspector) inspector).get(obj);
break;
}
case INT: {
LongColumnVector vector = (LongColumnVector) column;
vector.vector[rowId] = ((IntObjectInspector) inspector).get(obj);
break;
}
case LONG: {
LongColumnVector vector = (LongColumnVector) column;
vector.vector[rowId] = ((LongObjectInspector) inspector).get(obj);
break;
}
case FLOAT: {
DoubleColumnVector vector = (DoubleColumnVector) column;
vector.vector[rowId] =
((FloatObjectInspector) inspector).get(obj);
break;
}
case DOUBLE: {
DoubleColumnVector vector = (DoubleColumnVector) column;
vector.vector[rowId] =
((DoubleObjectInspector) inspector).get(obj);
break;
}
case BINARY: {
BytesColumnVector vector = (BytesColumnVector) column;
BytesWritable blob = ((BinaryObjectInspector) inspector)
.getPrimitiveWritableObject(obj);
vector.setVal(rowId, blob.getBytes(), 0, blob.getLength());
break;
}
case STRING: {
BytesColumnVector vector = (BytesColumnVector) column;
Text blob = ((StringObjectInspector) inspector)
.getPrimitiveWritableObject(obj);
vector.setVal(rowId, blob.getBytes(), 0, blob.getLength());
break;
}
case VARCHAR: {
BytesColumnVector vector = (BytesColumnVector) column;
Text blob = ((HiveVarcharObjectInspector) inspector)
.getPrimitiveWritableObject(obj).getTextValue();
vector.setVal(rowId, blob.getBytes(), 0, blob.getLength());
break;
}
case CHAR: {
BytesColumnVector vector = (BytesColumnVector) column;
Text blob = ((HiveCharObjectInspector) inspector)
.getPrimitiveWritableObject(obj).getTextValue();
vector.setVal(rowId, blob.getBytes(), 0, blob.getLength());
break;
}
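          // Mark the vector as holding UTC-based timestamps before storing
          // the value.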
case TIMESTAMP: {
TimestampColumnVector vector = (TimestampColumnVector) column;
vector.setIsUTC(true);
vector.set(rowId, ((TimestampObjectInspector) inspector)
.getPrimitiveJavaObject(obj).toSqlTimestamp());
break;
}
case DATE: {
LongColumnVector vector = (LongColumnVector) column;
vector.vector[rowId] = ((DateObjectInspector) inspector)
.getPrimitiveWritableObject(obj).getDays();
break;
}
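          // The constructor chose between Decimal64ColumnVector (scaled-long
          // representation) and DecimalColumnVector, so handle both here.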
case DECIMAL: {
if (column instanceof Decimal64ColumnVector) {
Decimal64ColumnVector vector = (Decimal64ColumnVector) column;
vector.set(rowId, ((HiveDecimalObjectInspector) inspector)
.getPrimitiveWritableObject(obj));
} else {
DecimalColumnVector vector = (DecimalColumnVector) column;
vector.set(rowId, ((HiveDecimalObjectInspector) inspector)
.getPrimitiveWritableObject(obj));
}
break;
}
        default:
          throw new IllegalArgumentException("Unknown primitive category " +
              ((PrimitiveObjectInspector) inspector).getPrimitiveCategory());
      }
break;
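      // STRUCT: recurse into each child vector with the matching field
      // inspector and field value.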
case STRUCT: {
StructColumnVector vector = (StructColumnVector) column;
StructObjectInspector oi = (StructObjectInspector) inspector;
      List<? extends StructField> fields = oi.getAllStructFieldRefs();
for (int c = 0; c < vector.fields.length; ++c) {
StructField field = fields.get(c);
setColumn(rowId, vector.fields[c], field.getFieldObjectInspector(),
oi.getStructFieldData(obj, field));
}
break;
}
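      // UNION: record which branch this row takes and populate only that
      // child vector.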
case UNION: {
UnionColumnVector vector = (UnionColumnVector) column;
UnionObjectInspector oi = (UnionObjectInspector) inspector;
int tag = oi.getTag(obj);
vector.tags[rowId] = tag;
setColumn(rowId, vector.fields[tag],
oi.getObjectInspectors().get(tag), oi.getField(obj));
break;
}
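      // LIST: all rows share one child vector; record this row's
      // (offset, length) slice and append its elements.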
case LIST: {
ListColumnVector vector = (ListColumnVector) column;
ListObjectInspector oi = (ListObjectInspector) inspector;
int offset = vector.childCount;
int length = oi.getListLength(obj);
vector.offsets[rowId] = offset;
vector.lengths[rowId] = length;
vector.child.ensureSize(offset + length, true);
vector.childCount += length;
for (int c = 0; c < length; ++c) {
setColumn(offset + c, vector.child,
oi.getListElementObjectInspector(),
oi.getListElement(obj, c));
}
break;
}
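      // MAP: keys and values live in two parallel child vectors that share a
      // single (offset, length) slice per row.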
case MAP: {
MapColumnVector vector = (MapColumnVector) column;
MapObjectInspector oi = (MapObjectInspector) inspector;
int offset = vector.childCount;
      Set<? extends Map.Entry<?, ?>> map = oi.getMap(obj).entrySet();
int length = map.size();
vector.offsets[rowId] = offset;
vector.lengths[rowId] = length;
vector.keys.ensureSize(offset + length, true);
vector.values.ensureSize(offset + length, true);
vector.childCount += length;
      for (Map.Entry<?, ?> pair : map) {
setColumn(offset, vector.keys, oi.getMapKeyObjectInspector(),
pair.getKey());
setColumn(offset, vector.values, oi.getMapValueObjectInspector(),
pair.getValue());
offset += 1;
}
break;
}
default:
throw new IllegalArgumentException("Unknown ObjectInspector kind " +
inspector.getCategory());
}
}
}
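  // Hands any buffered rows to the underlying ORC writer and resets the
  // batch for reuse.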
void flushInternalBatch() throws IOException {
if (internalBatch.size != 0) {
super.addRowBatch(internalBatch);
internalBatch.reset();
}
}
@Override
public void addRow(Object row) throws IOException {
int rowId = internalBatch.size++;
if (fields != null) {
StructObjectInspector soi = (StructObjectInspector) inspector;
for(int i=0; i < fields.length; ++i) {
setColumn(rowId, internalBatch.cols[i],
fields[i].getFieldObjectInspector(),
soi.getStructFieldData(row, fields[i]));
}
} else {
setColumn(rowId, internalBatch.cols[0], inspector, row);
}
if (internalBatch.size == internalBatch.getMaxSize()) {
flushInternalBatch();
}
}
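  // Flush buffered rows first so the intermediate footer covers every row
  // added so far.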
@Override
public long writeIntermediateFooter() throws IOException {
flushInternalBatch();
return super.writeIntermediateFooter();
}
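  // Flush rows buffered by addRow() first so they are written ahead of the
  // caller's batch.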
@Override
public void addRowBatch(VectorizedRowBatch batch) throws IOException {
flushInternalBatch();
super.addRowBatch(batch);
}
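  // Make sure the last partially filled batch reaches the file before the
  // writer closes.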
@Override
public void close() throws IOException {
flushInternalBatch();
super.close();
}
}