// org.apache.hadoop.hive.serde2.MultiDelimitSerDe (Maven / Gradle / Ivy artifact page header)
// The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* License); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.serde2;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.lazy.ByteArrayRef;
import org.apache.hadoop.hive.serde2.lazy.LazyFactory;
import org.apache.hadoop.hive.serde2.lazy.LazyStruct;
import org.apache.hadoop.hive.serde2.lazy.LazyUtils;
import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
/**
* This SerDe allows user to use multiple characters as the field delimiter for a table.
* To use this SerDe, user has to specify field.delim in SERDEPROPERTIES.
* If the table contains a column which is a collection or map, user can optionally
* specify collection.delim or mapkey.delim as well.
* Currently field.delim can be multiple character while collection.delim
* and mapkey.delim should be just single character.
*/
@SerDeSpec(schemaProps = {
serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
serdeConstants.FIELD_DELIM, serdeConstants.COLLECTION_DELIM, serdeConstants.MAPKEY_DELIM,
serdeConstants.SERIALIZATION_FORMAT, serdeConstants.SERIALIZATION_NULL_FORMAT,
serdeConstants.SERIALIZATION_ESCAPE_CRLF,
serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST,
serdeConstants.ESCAPE_CHAR,
serdeConstants.SERIALIZATION_ENCODING,
LazySerDeParameters.SERIALIZATION_EXTEND_NESTING_LEVELS,
LazySerDeParameters.SERIALIZATION_EXTEND_ADDITIONAL_NESTING_LEVELS})
public class MultiDelimitSerDe extends AbstractEncodingAwareSerDe {
// Default separators at nesting levels 0/1/2 (field, collection, map-key),
// matching LazySimpleSerDe's '\1', '\2', '\3' convention.
private static final byte[] DEFAULT_SEPARATORS = {(byte) 1, (byte) 2, (byte) 3};
// actual delimiter(fieldDelimited) is replaced by REPLACEMENT_DELIM in row.
public static final String REPLACEMENT_DELIM_SEQUENCE = "\1";
// Byte length of the replacement sequence ("\1" is ASCII, so this is 1 in
// any charset despite the charset-less getBytes() call).
public static final int REPLACEMENT_DELIM_LENGTH = REPLACEMENT_DELIM_SEQUENCE.getBytes().length;
// Number of columns in the table schema, derived from serdeParams.
private int numColumns;
// The (possibly multi-character) field delimiter from "field.delim".
private String fieldDelimited;
// we don't support using multiple chars as delimiters within complex types
// collection separator
private byte collSep;
// map key separator
private byte keySep;
// The object for storing row data
private LazyStruct cachedLazyStruct;
//the lazy struct object inspector
private ObjectInspector cachedObjectInspector;
// The wrapper for byte array
private ByteArrayRef byteArrayRef;
private LazySerDeParameters serdeParams = null;
// The output stream of serialized objects
private final ByteStream.Output serializeStream = new ByteStream.Output();
// The Writable to return in serialize
private final Text serializeCache = new Text();
@Override
public void initialize(Configuration configuration, Properties tableProperties, Properties partitionProperties)
throws SerDeException {
// Superclass merges table/partition properties into the `properties` field
// and resolves serialization.encoding; must run before we read `properties`.
super.initialize(configuration, tableProperties, partitionProperties);
serdeParams = new LazySerDeParameters(configuration, tableProperties, getClass().getName());
fieldDelimited = properties.getProperty(serdeConstants.FIELD_DELIM);
// field.delim is mandatory for this SerDe (it is the whole point of it).
if (fieldDelimited == null || fieldDelimited.isEmpty()) {
throw new SerDeException("This table does not have serde property \"field.delim\"!");
}
// get the collection separator and map key separator
collSep = LazyUtils.getByte(properties.getProperty(serdeConstants.COLLECTION_DELIM),
DEFAULT_SEPARATORS[1]);
keySep = LazyUtils.getByte(properties.getProperty(serdeConstants.MAPKEY_DELIM),
DEFAULT_SEPARATORS[2]);
// Install the single-byte separators at nesting levels 1 and 2 before
// building the inspector, so complex-type parsing sees them.
serdeParams.setSeparator(1, collSep);
serdeParams.setSeparator(2, keySep);
// Create the ObjectInspectors for the fields
cachedObjectInspector = LazyFactory.createLazyStructInspector(serdeParams
.getColumnNames(), serdeParams.getColumnTypes(), serdeParams
.getSeparators(), serdeParams.getNullSequence(), serdeParams
.isLastColumnTakesRest(), serdeParams.isEscaped(), serdeParams
.getEscapeChar());
cachedLazyStruct = (LazyStruct) LazyFactory.createLazyObject(cachedObjectInspector);
assert serdeParams.getColumnNames().size() == serdeParams.getColumnTypes().size();
numColumns = serdeParams.getColumnNames().size();
}
/**
 * Returns the lazy struct ObjectInspector built in {@code initialize} from
 * the table's column names/types. Never null after successful initialization.
 */
@Override
public ObjectInspector getObjectInspector() throws SerDeException {
return cachedObjectInspector;
}
@Override
public Class extends Writable> getSerializedClass() {
return Text.class;
}
/**
 * Deserializes one row. The multi-character field delimiter is first replaced
 * by the single-byte {@code REPLACEMENT_DELIM_SEQUENCE} ('\1') so the lazy
 * struct machinery can work with a single-byte separator, while the original
 * multi-char delimiter is still handed to {@code parseMultiDelimit} to locate
 * the true field boundaries (column data may legitimately contain '\1').
 *
 * @param blob row bytes as {@link BytesWritable} or {@link Text}
 * @return the reused {@link LazyStruct} holding the parsed row
 * @throws SerDeException if {@code blob} is neither BytesWritable nor Text
 */
@Override
public Object doDeserialize(Writable blob) throws SerDeException {
  if (byteArrayRef == null) {
    byteArrayRef = new ByteArrayRef();
  }
  String rowStr;
  if (blob instanceof BytesWritable) {
    BytesWritable b = (BytesWritable) blob;
    // BUG FIX: getBytes() returns the backing buffer, which may be longer
    // than the valid payload — bound the decode by getLength(). Also decode
    // explicitly as UTF-8 instead of the platform default charset, matching
    // the UTF-8 assumption used below when re-encoding rowStr.
    rowStr = new String(b.getBytes(), 0, b.getLength(), StandardCharsets.UTF_8);
  } else if (blob instanceof Text) {
    Text rowText = (Text) blob;
    rowStr = rowText.toString();
  } else {
    throw new SerDeException(getClass() + ": expects either BytesWritable or Text object!");
  }
  // at this point, rowStr is supposed to be encoded with UTF8 (not with the serde's charset)
  byteArrayRef.setData(
      rowStr.replaceAll(Pattern.quote(fieldDelimited), REPLACEMENT_DELIM_SEQUENCE).getBytes(StandardCharsets.UTF_8));
  cachedLazyStruct.init(byteArrayRef, 0, byteArrayRef.getData().length);
  // use the multi-char delimiter to parse the lazy struct
  cachedLazyStruct.parseMultiDelimit(rowStr.getBytes(StandardCharsets.UTF_8),
      fieldDelimited.getBytes(StandardCharsets.UTF_8));
  return cachedLazyStruct;
}
@Override
public Writable doSerialize(Object obj, ObjectInspector objInspector)
throws SerDeException {
StructObjectInspector soi = (StructObjectInspector) objInspector;
List extends StructField> fields = soi.getAllStructFieldRefs();
List
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy (artifact page footer)