// Source listing: org.apache.hadoop.hive.serde2.thrift.TCTLSeparatedProtocol
// (hive-serde artifact, newest version, as shown by a Maven / Gradle / Ivy artifact browser).
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.serde2.thrift;

import java.io.EOFException;
import java.nio.charset.CharacterCodingException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.NoSuchElementException;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.io.Text;
import org.apache.thrift.TException;
import org.apache.thrift.protocol.TField;
import org.apache.thrift.protocol.TList;
import org.apache.thrift.protocol.TMap;
import org.apache.thrift.protocol.TMessage;
import org.apache.thrift.protocol.TProtocol;
import org.apache.thrift.protocol.TProtocolFactory;
import org.apache.thrift.protocol.TSet;
import org.apache.thrift.protocol.TStruct;
import org.apache.thrift.protocol.TType;
import org.apache.thrift.transport.TTransport;
import org.apache.thrift.transport.TTransportException;

/**
 *
 * An implementation of the Thrift Protocol for ctl separated records. This is
 * not thrift compliant in that it doesn't write out field ids so things cannot
 * actually be versioned.
 */
public class TCTLSeparatedProtocol extends TProtocol implements
    ConfigurableTProtocol, WriteNullsProtocol, SkippableTProtocol {

  // Logger shared by the protocol and its nested tokenizer.
  static final Logger LOG = LoggerFactory.getLogger(TCTLSeparatedProtocol.class
      .getName());

  // Marker type placed in the TField/TMap/TList/TSet headers this protocol
  // returns when reading: field ids and element types are not carried in the
  // data, values are simply consumed in order.
  static byte ORDERED_TYPE = (byte) -1;

  /**
   * Factory for JSON protocol objects.
   */
  public static class Factory implements TProtocolFactory {

    public TProtocol getProtocol(TTransport trans) {
      return new TCTLSeparatedProtocol(trans);
    }

  }

  /**
   * Default separators used by the short constructors: ctl-A between columns,
   * ctl-B between collection elements, newline between rows and ctl-C between
   * a map key and its value.
   */
  protected static final String defaultPrimarySeparator = "\001";
  protected static final String defaultSecondarySeparator = "\002";
  protected static final String defaultRowSeparator = "\n";
  protected static final String defaultMapSeparator = "\003";

  /**
   * The separators for this instance, plus the regex patterns compiled from
   * them in internalInitialize(). The separator strings are handed to
   * Pattern.compile as-is, i.e. they are interpreted as regular expressions.
   */
  protected String primarySeparator;
  protected String secondarySeparator;
  protected String rowSeparator;
  protected String mapSeparator;
  protected Pattern primaryPattern;
  protected Pattern secondaryPattern;
  protected Pattern mapPattern;

  /**
   * The quote character when supporting quotes with ability to not split across
   * quoted entries. Like csv. Note that escaping the quote is not currently
   * supported. Null (the default) disables quote handling.
   */
  protected String quote;

  /**
   * Inspect the separators this instance is configured with.
   *
   * @return the separator between columns
   */
  public String getPrimarySeparator() {
    return primarySeparator;
  }

  /**
   * @return the separator between the elements of a list, set or map column
   */
  public String getSecondarySeparator() {
    return secondarySeparator;
  }

  /**
   * @return the separator between rows
   */
  public String getRowSeparator() {
    return rowSeparator;
  }

  /**
   * @return the separator between a map key and its value
   */
  public String getMapSeparator() {
    return mapSeparator;
  }

  /**
   * The transport stream is tokenized on the row separator. Created in
   * internalInitialize(), so it is null until the protocol is initialized.
   */
  protected SimpleTransportTokenizer transportTokenizer;

  /**
   * For a single row, the split on the primary separator. Set by
   * readStructBegin() and cleared by readStructEnd().
   */
  protected String[] columns;

  /**
   * An index into what column we're on.
   */

  protected int index;

  /**
   * For a single column, a split on the secondary separator (and, for maps,
   * on the map separator as well). Cleared by readFieldEnd().
   */
  protected String[] fields;

  /**
   * An index into what field within a column we're on.
   */
  protected int innerIndex;

  /**
   * Is this the first field we're writing.
   */
  protected boolean firstField;

  /**
   * Is this the first list/map/set field we're writing for the current element.
   */
  protected boolean firstInnerField;

  /**
   * Are we writing a map and need to worry about k/v separator? Also set
   * while a map is being read.
   */
  protected boolean isMap;

  /**
   * For writes, on what element are we on so we know when to use normal list
   * separator or for a map know when to use the k/v separator.
   */
  protected long elemIndex;

  /**
   * Are we currently on the top-level columns or parsing a column itself.
   * True between a list/map/set begin call and the matching end call.
   */
  protected boolean inner;

  /**
   * For places where the separators are back to back, should we return a null
   * or an empty string since it is ambiguous. This also applies to extra
   * columns that are read but aren't in the current record.
   */
  protected boolean returnNulls;

  /**
   * The transport being wrapped.
   *
   */
  protected final TTransport innerTransport;

  /**
   * Strings used to lookup the various configurable parameters of this
   * protocol in the table properties passed to initialize().
   */
  public static final String ReturnNullsKey = "separators.return_nulls";
  public static final String BufferSizeKey = "separators.buffer_size";

  /**
   * The size, in bytes, of the internal read buffer to use.
   */
  protected int bufferSize;

  /**
   * The string representing nulls in the serialized data. e.g., \N as in mysql.
   */
  protected String nullString;

  /**
   * The nullString in UTF-8 bytes. Built in internalInitialize().
   */
  protected Text nullText;

  /**
   * A convenience class for tokenizing a TTransport.
   *
   * Bytes are read from the transport one buffer at a time, decoded with
   * Text.decode and split on the separator. A token that continues past the
   * end of one buffer is stitched together from the buffers that follow.
   */
  class SimpleTransportTokenizer {

    TTransport trans;
    StringTokenizer tokenizer;
    final String separator;
    byte[] buf;

    /**
     * @param trans
     *          the transport to read from
     * @param separator
     *          the delimiter set handed to StringTokenizer
     * @param buffer_length
     *          size in bytes of the read buffer
     */
    public SimpleTransportTokenizer(TTransport trans, String separator,
        int buffer_length) {
      this.trans = trans;
      this.separator = separator;
      buf = new byte[buffer_length];
    }

    /**
     * Reads the first buffer from the transport.
     *
     * A failure here is only logged. In every failure path of
     * fillTokenizer() the tokenizer is still null afterwards, so the first
     * nextToken() call retries the read and reports the problem to the
     * caller that actually wants data.
     */
    private void initialize() {
      try {
        fillTokenizer();
      } catch (Exception e) {
        LOG.warn("Unable to initialize tokenizer", e);
      }
    }

    /**
     * Reads the next buffer from the transport and points the tokenizer at
     * its decoded contents.
     *
     * @return true if data was read; false if the transport had nothing more
     *         to give (the tokenizer is then an empty one)
     * @throws RuntimeException
     *           wrapping a decoding failure or any transport failure other
     *           than END_OF_FILE
     */
    private boolean fillTokenizer() {
      try {
        int length = trans.read(buf, 0, buf.length);
        if (length <= 0) {
          tokenizer = new StringTokenizer("", separator, true);
          return false;
        }
        String row;
        try {
          // NOTE(review): each buffer is decoded on its own, so a multi-byte
          // character split across two reads would presumably fail here -
          // confirm whether the transports in use always deliver whole rows.
          row = Text.decode(buf, 0, length);
        } catch (CharacterCodingException e) {
          throw new RuntimeException(e);
        }
        tokenizer = new StringTokenizer(row, separator, true);
      } catch (TTransportException e) {
        if (e.getType() == TTransportException.END_OF_FILE) {
          tokenizer = new StringTokenizer("", separator, true);
          return false;
        }
        tokenizer = null;
        throw new RuntimeException(e);
      }
      return true;
    }

    /**
     * Returns the text up to the next separator, or up to the end of the
     * data when the last token is not terminated by a separator.
     *
     * @return the token, or null when a separator is found with nothing in
     *         front of it (an empty token)
     * @throws EOFException
     *           if the data is exhausted and there is no token to return
     */
    public String nextToken() throws EOFException {
      StringBuilder ret = null;
      boolean done = false;

      if (tokenizer == null) {
        fillTokenizer();
      }

      while (!done) {

        if (!tokenizer.hasMoreTokens() && !fillTokenizer()) {
          if (ret == null) {
            // Nothing buffered and nothing left to read: report end of data
            // through the declared exception, the same way the
            // NoSuchElementException path below does, instead of returning a
            // null that callers cannot tell apart from an empty token.
            throw new EOFException("No more data to tokenize");
          }
          // Last token of the stream, not terminated by a separator.
          break;
        }
        try {
          final String nextToken = tokenizer.nextToken();

          if (nextToken.equals(separator)) {
            done = true;
          } else if (ret == null) {
            ret = new StringBuilder(nextToken);
          } else {
            ret.append(nextToken);
          }
        } catch (NoSuchElementException e) {
          if (ret == null) {
            throw new EOFException(e.getMessage());
          }
          done = true;
        }
      } // while ! done
      return ret == null ? null : ret.toString();
    }
  }

  /**
   * The simple constructor which assumes the default separators (ctl-a between
   * columns, ctl-b between collection elements, ctl-c between map keys and
   * values, '\n' between rows), a 4096 byte read buffer, and returns nulls
   * (not empty strings) for empty fields.
   *
   * @param trans
   *          - the ttransport to use as input or output
   *
   */

  public TCTLSeparatedProtocol(TTransport trans) {
    this(trans, defaultPrimarySeparator, defaultSecondarySeparator,
        defaultMapSeparator, defaultRowSeparator, true, 4096);
  }

  /**
   * Always answers -1, whatever the type code passed in.
   *
   * @param b
   *          the type code (ignored)
   * @return -1
   */
  @Override
  public int getMinSerializedSize(byte b) throws TException {
    return -1;
  }

  /**
   * Same defaults as the single-argument constructor (default separators,
   * nulls returned for empty fields) but with a caller-chosen buffer size.
   *
   * @param trans
   *          - the ttransport to use as input or output
   * @param buffer_size
   *          - the size in bytes of the internal read buffer
   */
  public TCTLSeparatedProtocol(TTransport trans, int buffer_size) {
    this(trans, defaultPrimarySeparator, defaultSecondarySeparator,
        defaultMapSeparator, defaultRowSeparator, true, buffer_size);
  }

  /**
   * Stores the configuration only. The separator patterns, the null marker
   * bytes and the transport tokenizer are created later by
   * internalInitialize(), which initialize() invokes.
   *
   * @param trans
   *          - the ttransport to use as input or output
   * @param primarySeparator
   *          the separator between columns (aka fields)
   * @param secondarySeparator
   *          the separator within a field for things like sets and maps and
   *          lists
   * @param mapSeparator
   *          - the key/value separator
   * @param rowSeparator
   *          - the record separator
   * @param returnNulls
   *          - whether to return a null or an empty string for fields that seem
   *          empty (ie two primary separators back to back)
   * @param bufferSize
   *          - the size in bytes of the internal read buffer
   */

  public TCTLSeparatedProtocol(TTransport trans, String primarySeparator,
      String secondarySeparator, String mapSeparator, String rowSeparator,
      boolean returnNulls, int bufferSize) {
    super(trans);

    this.returnNulls = returnNulls;

    this.primarySeparator = primarySeparator;
    this.secondarySeparator = secondarySeparator;
    this.rowSeparator = rowSeparator;
    this.mapSeparator = mapSeparator;

    innerTransport = trans;
    this.bufferSize = bufferSize;
    // Default null marker; initialize() may replace it from table properties.
    nullString = "\\N";
  }

  /**
   * Compiles the separator patterns from the current separator settings,
   * encodes the null marker and creates (and primes) the transport tokenizer.
   *
   * Without a quote character a row is split directly on the primary
   * separator. With one, the primary pattern instead matches one field at a
   * time, either a quoted run or a run free of separators, for use by
   * complexSplit().
   */
  protected void internalInitialize() {
    final boolean quoted = quote != null;

    // in the future could allow users to specify a quote character that doesn't
    // need escaping but for now ...
    final String primaryRegex;
    if (quoted) {
      primaryRegex = new StringBuilder()
          .append("(?:^|").append(primarySeparator).append(")(")
          .append(quote).append("(?:[^").append(quote).append("]+|")
          .append(quote).append(quote).append(")*").append(quote)
          .append("|[^").append(primarySeparator).append("]*)")
          .toString();
    } else {
      primaryRegex = primarySeparator;
    }

    if (quoted) {
      // helpers complexSplit() uses to peel separator and quotes off a match
      stripSeparatorPrefix = Pattern.compile("^" + primarySeparator);
      stripQuotePrefix = Pattern.compile("^" + quote);
      stripQuotePostfix = Pattern.compile(quote + "$");
    }

    primaryPattern = Pattern.compile(primaryRegex);
    secondaryPattern = Pattern.compile(secondarySeparator);
    mapPattern = Pattern.compile(secondarySeparator + "|" + mapSeparator);

    nullText = new Text(nullString);

    final SimpleTransportTokenizer rowTokenizer = new SimpleTransportTokenizer(
        innerTransport, rowSeparator, bufferSize);
    transportTokenizer = rowTokenizer;
    rowTokenizer.initialize();
  }

  /**
   * For quoted fields, strip away the quotes and also need something to strip
   * away the control separator when using complex split method defined here.
   * These are only compiled by internalInitialize() when a quote character is
   * configured; otherwise they stay null.
   */
  protected Pattern stripSeparatorPrefix;
  protected Pattern stripQuotePrefix;
  protected Pattern stripQuotePostfix;

  /**
   *
   * Split the line based on a complex regex pattern.
   *
   * Every match of the pattern is one field. An empty match becomes a null
   * entry; otherwise one leading separator character, one leading quote
   * character and one trailing quote character are removed when present.
   * Relies on the strip patterns, which exist only when a quote character is
   * configured.
   *
   * @param line
   *          the current row
   * @param p
   *          the pattern for matching fields in the row
   * @return List of Strings - not including the separator in them; the array
   *         always has at least one slot, holding null if nothing matched
   */
  protected String[] complexSplit(String line, Pattern p) {

    // Typed list: with a raw ArrayList, toArray(T[]) is erased to return
    // Object[], which does not satisfy the String[] return type.
    final ArrayList<String> list = new ArrayList<String>();
    final Matcher m = p.matcher(line);
    // For each field
    while (m.find()) {
      String match = m.group();
      if (match == null) {
        break;
      }
      if (match.length() == 0) {
        match = null;
      } else {
        // The strip patterns only detect the prefix/postfix; removal assumes
        // the separator and the quote are one character long.
        if (stripSeparatorPrefix.matcher(match).find()) {
          match = match.substring(1);
        }
        if (stripQuotePrefix.matcher(match).find()) {
          match = match.substring(1);
        }
        if (stripQuotePostfix.matcher(match).find()) {
          match = match.substring(0, match.length() - 1);
        }
      }
      list.add(match);
    }
    // A one-element seed array keeps the historical result for an empty list:
    // an array of length 1 containing null rather than a zero-length array.
    return list.toArray(new String[1]);
  }

  /**
   * Resolves a separator setting to the string actually used.
   *
   * A setting that parses as a decimal byte (for example "1" or "9") is taken
   * to be a character code and converted to the one-character string for that
   * byte; anything else is used literally.
   *
   * @param altValue
   *          the configured value, may be null or empty
   * @param defaultVal
   *          returned when {@code altValue} is null or empty
   * @return the separator string
   */
  protected String getByteValue(String altValue, String defaultVal) {
    if (altValue == null || altValue.isEmpty()) {
      return defaultVal;
    }
    try {
      final byte[] b = {Byte.parseByte(altValue)};
      // Decode with an explicit charset so the result does not depend on the
      // platform default; UTF-8 is what Text uses for the serialized data.
      return new String(b, java.nio.charset.StandardCharsets.UTF_8);
    } catch (NumberFormatException e) {
      // Not a byte value: the setting is the separator itself.
      return altValue;
    }
  }

  /**
   * Initialize the TProtocol.
   *
   * Table properties override the values given at construction time: the
   * four separators, whether to return nulls, the read buffer size, the null
   * marker and the quote character. Afterwards the patterns and the
   * tokenizer are (re)built through internalInitialize().
   *
   * @param conf
   *          System properties (not read by this method)
   * @param tbl
   *          table properties
   * @throws TException
   *           if the configured buffer size is not a non-negative integer
   */
  public void initialize(Configuration conf, Properties tbl) throws TException {

    primarySeparator = getByteValue(tbl.getProperty(serdeConstants.FIELD_DELIM),
        primarySeparator);
    secondarySeparator = getByteValue(tbl
        .getProperty(serdeConstants.COLLECTION_DELIM), secondarySeparator);
    rowSeparator = getByteValue(tbl.getProperty(serdeConstants.LINE_DELIM),
        rowSeparator);
    mapSeparator = getByteValue(tbl.getProperty(serdeConstants.MAPKEY_DELIM),
        mapSeparator);
    returnNulls = Boolean.parseBoolean(
        tbl.getProperty(ReturnNullsKey, String.valueOf(returnNulls)));

    // Report a bad buffer size through the declared exception type instead of
    // letting NumberFormatException / NegativeArraySizeException escape.
    final String bufferSizeValue =
        tbl.getProperty(BufferSizeKey, String.valueOf(bufferSize));
    final int requestedBufferSize;
    try {
      requestedBufferSize = Integer.parseInt(bufferSizeValue);
    } catch (NumberFormatException e) {
      throw new TException("Invalid value for " + BufferSizeKey + ": "
          + bufferSizeValue, e);
    }
    if (requestedBufferSize < 0) {
      throw new TException(BufferSizeKey + " must not be negative: "
          + requestedBufferSize);
    }
    bufferSize = requestedBufferSize;

    nullString = tbl.getProperty(serdeConstants.SERIALIZATION_NULL_FORMAT, "\\N");
    quote = tbl.getProperty(serdeConstants.QUOTE_CHAR, null);

    internalInitialize();

  }

  // Messages have no representation in this format: nothing is written.
  @Override
  public void writeMessageBegin(TMessage message) throws TException {
  }

  @Override
  public void writeMessageEnd() throws TException {
  }

  // Starting a struct starts a new row: the next field written is the first
  // one and therefore gets no leading separator.
  @Override
  public void writeStructBegin(TStruct struct) throws TException {
    firstField = true;
  }

  @Override
  public void writeStructEnd() throws TException {
    // We don't write rowSeparatorByte because that should be handled by file
    // format.
  }

  // Emits the primary separator in front of every field except the first.
  // The field's name and id are not written.
  @Override
  public void writeFieldBegin(TField field) throws TException {
    if (!firstField) {
      internalWriteString(primarySeparator);
    }
    firstField = false;
  }

  // Fields are terminated by the separator of the field that follows, so
  // there is nothing to write at the end of a field or of the field list.
  @Override
  public void writeFieldEnd() throws TException {
  }

  @Override
  public void writeFieldStop() {
  }

  /**
   * Starts writing a map column. Keys and values must both be primitives.
   *
   * @throws TException
   *           if the key type or the value type is a struct, map, list or set
   */
  @Override
  public void writeMapBegin(TMap map) throws TException {
    // nesting not allowed, neither for keys nor for values
    for (byte memberType : new byte[] {map.keyType, map.valueType}) {
      switch (memberType) {
      case TType.STRUCT:
      case TType.MAP:
      case TType.LIST:
      case TType.SET:
        throw new TException("Not implemented: nested structures");
      default:
        break;
      }
    }

    elemIndex = 0;
    inner = true;
    isMap = true;
    firstInnerField = true;
  }

  /**
   * Leaves map mode and returns to the top-level columns.
   */
  @Override
  public void writeMapEnd() throws TException {
    inner = false;
    isMap = false;
  }

  /**
   * Starts writing a list column. Elements must be primitives.
   *
   * @throws TException
   *           if the element type is a struct, map, list or set
   */
  @Override
  public void writeListBegin(TList list) throws TException {
    switch (list.elemType) {
    case TType.STRUCT:
    case TType.MAP:
    case TType.LIST:
    case TType.SET:
      throw new TException("Not implemented: nested structures");
    default:
      break;
    }
    inner = true;
    firstInnerField = true;
  }

  /**
   * Returns to the top-level columns.
   */
  @Override
  public void writeListEnd() throws TException {
    inner = false;
  }

  /**
   * Starts writing a set column. Elements must be primitives.
   *
   * @throws TException
   *           if the element type is a struct, map, list or set
   */
  @Override
  public void writeSetBegin(TSet set) throws TException {
    switch (set.elemType) {
    case TType.STRUCT:
    case TType.MAP:
    case TType.LIST:
    case TType.SET:
      throw new TException("Not implemented: nested structures");
    default:
      break;
    }
    inner = true;
    firstInnerField = true;
  }

  /**
   * Returns to the top-level columns.
   */
  @Override
  public void writeSetEnd() throws TException {
    inner = false;
  }

  /**
   * Writes the boolean as the text "true" or "false".
   */
  @Override
  public void writeBool(boolean b) throws TException {
    final String text = b ? "true" : "false";
    writeString(text);
  }

  /**
   * Writes the byte as decimal text, like the other integer types.
   *
   * Going through writeString() keeps the element separators correct inside
   * lists, sets and maps and produces exactly what readByte() parses; writing
   * the raw byte value to the transport did neither.
   */
  @Override
  public void writeByte(byte b) throws TException {
    writeString(String.valueOf(b));
  }

  /**
   * Writes the value as decimal text.
   */
  @Override
  public void writeI16(short i16) throws TException {
    final String text = Short.toString(i16);
    writeString(text);
  }

  /**
   * Writes the value as decimal text.
   */
  @Override
  public void writeI32(int i32) throws TException {
    final String text = Integer.toString(i32);
    writeString(text);
  }

  /**
   * Writes the value as decimal text.
   */
  @Override
  public void writeI64(long i64) throws TException {
    final String text = Long.toString(i64);
    writeString(text);
  }

  /**
   * Writes the value in Java's standard string form for doubles.
   */
  @Override
  public void writeDouble(double dub) throws TException {
    final String text = Double.toString(dub);
    writeString(text);
  }

  // Scratch buffer reused for encoding every string that is written.
  Text tmpText = new Text();

  /**
   * Writes the encoded bytes of a string straight to the transport, without
   * any separator handling.
   *
   * @param str
   *          the text to write; null writes the null marker instead
   */
  public void internalWriteString(String str) throws TException {
    final Text encoded;
    if (str == null) {
      encoded = nullText;
    } else {
      tmpText.set(str);
      encoded = tmpText;
    }
    trans_.write(encoded.getBytes(), 0, encoded.getLength());
  }

  /**
   * Writes one value. Inside a list, set or map every value but the first is
   * preceded by a separator: for maps the key/value separator and the element
   * separator alternate, otherwise it is always the element separator.
   *
   * @param str
   *          the value; null writes the null marker
   */
  @Override
  public void writeString(String str) throws TException {
    if (inner) {
      if (firstInnerField) {
        firstInnerField = false;
      } else {
        // elemIndex only advances for maps, and only from the second value
        // on, so an even count means the value being written follows a key.
        final boolean followsKey = isMap && (elemIndex++ % 2 == 0);
        internalWriteString(followsKey ? mapSeparator : secondarySeparator);
      }
    }
    internalWriteString(str);
  }

  /**
   * Binary data is not supported by this format.
   *
   * @throws TException
   *           always
   */
  @Override
  public void writeBinary(ByteBuffer bin) throws TException {
    throw new TException(
        "Ctl separated protocol cannot support writing Binary data!");
  }

  // Messages are not represented in the data: an empty TMessage is returned
  // and nothing is consumed from the input.
  @Override
  public TMessage readMessageBegin() throws TException {
    return new TMessage();
  }

  @Override
  public void readMessageEnd() throws TException {
  }

  /**
   * Reads the next row from the transport and splits it into columns.
   *
   * @return an empty TStruct once the row is loaded, or null if the tokenizer
   *         reports the end of the data
   */
  @Override
  public TStruct readStructBegin() throws TException {
    assert (!inner);
    try {
      final String tmp = transportTokenizer.nextToken();
      if (tmp == null) {
        // The tokenizer hands back null when there is no text in front of the
        // next row separator. Treat that as a row without columns, so every
        // field reads as missing, rather than passing null to the splitter.
        columns = new String[0];
      } else if (quote == null) {
        columns = primaryPattern.split(tmp);
      } else {
        columns = complexSplit(tmp, primaryPattern);
      }
      index = 0;
      return new TStruct();
    } catch (EOFException e) {
      return null;
    }
  }

  // Drops the columns of the row that was just read.
  @Override
  public void readStructEnd() throws TException {
    columns = null;
  }

  /**
   * Skips past the current value by incrementing the relevant index: the
   * element index while inside a list, set or map, the column index otherwise.
   *
   * @param type
   *          the type of the value being skipped (not used)
   */
  public void skip(byte type) {
    if (inner) {
      innerIndex++;
    } else {
      index++;
    }
  }

  /**
   * Returns a placeholder field header: empty name, ORDERED_TYPE, id -1.
   */
  @Override
  public TField readFieldBegin() throws TException {
    assert (!inner);
    // slight hack to communicate to SerDes that the field ids are not
    // being set but things are ordered.
    return new TField("", ORDERED_TYPE, (short) -1);
  }

  /**
   * Discards the elements split out of the column that was just read.
   */
  @Override
  public void readFieldEnd() throws TException {
    fields = null;
  }

  /**
   * Starts reading the current column as a map and advances to the next
   * column.
   *
   * A column that is missing from the row is handled like a null column, the
   * same way readString() treats columns past the end of the row.
   *
   * @return null if the column is null (or the null marker) and nulls are
   *         being returned; an empty map header if the column is empty or is
   *         a null that is not being returned; otherwise a header whose size
   *         is half the number of values split out of the column
   */
  @Override
  public TMap readMapBegin() throws TException {
    assert (!inner);
    final String column =
        columns != null && index < columns.length ? columns[index] : null;
    index++;

    TMap map = new TMap();
    if (column == null || column.equals(nullString)) {
      if (returnNulls) {
        return null;
      }
    } else if (!column.isEmpty()) {
      // keys and values alternate in the split result
      fields = mapPattern.split(column);
      map = new TMap(ORDERED_TYPE, ORDERED_TYPE, fields.length / 2);
    }
    innerIndex = 0;
    inner = true;
    isMap = true;
    return map;
  }

  /**
   * Leaves map mode and returns to the top-level columns.
   */
  @Override
  public void readMapEnd() throws TException {
    inner = false;
    isMap = false;
  }

  /**
   * Starts reading the current column as a list and advances to the next
   * column.
   *
   * A column that is missing from the row is handled like a null column, the
   * same way readString() treats columns past the end of the row.
   *
   * @return null if the column is null (or the null marker) and nulls are
   *         being returned; an empty list header if the column is empty or is
   *         a null that is not being returned; otherwise a header sized to the
   *         number of elements split out of the column
   */
  @Override
  public TList readListBegin() throws TException {
    assert (!inner);
    final String column =
        columns != null && index < columns.length ? columns[index] : null;
    index++;

    TList list = new TList();
    if (column == null || column.equals(nullString)) {
      if (returnNulls) {
        return null;
      }
    } else if (!column.isEmpty()) {
      fields = secondaryPattern.split(column);
      list = new TList(ORDERED_TYPE, fields.length);
    }
    innerIndex = 0;
    inner = true;
    return list;
  }

  /**
   * Returns to the top-level columns.
   */
  @Override
  public void readListEnd() throws TException {
    inner = false;
  }

  /**
   * Starts reading the current column as a set and advances to the next
   * column.
   *
   * A column that is missing from the row is handled like a null column, the
   * same way readString() treats columns past the end of the row.
   *
   * @return null if the column is null (or the null marker) and nulls are
   *         being returned; an empty set header if the column is empty or is
   *         a null that is not being returned; otherwise a header sized to the
   *         number of elements split out of the column
   */
  @Override
  public TSet readSetBegin() throws TException {
    assert (!inner);
    final String column =
        columns != null && index < columns.length ? columns[index] : null;
    index++;

    TSet set = new TSet();
    if (column == null || column.equals(nullString)) {
      if (returnNulls) {
        return null;
      }
    } else if (!column.isEmpty()) {
      fields = secondaryPattern.split(column);
      set = new TSet(ORDERED_TYPE, fields.length);
    }
    inner = true;
    innerIndex = 0;
    return set;
  }

  // Set by readBool() and the numeric read methods: true when the value read
  // was null or, for the numeric ones, could not be parsed. readString()
  // itself does not update it.
  protected boolean lastPrimitiveWasNullFlag;

  /**
   * @return whether the most recent boolean/numeric read found a null (or, for
   *         numbers, unparseable) value
   */
  public boolean lastPrimitiveWasNull() throws TException {
    return lastPrimitiveWasNullFlag;
  }

  /**
   * Writes a null value; it goes through writeString(), so separators are
   * handled as for any other value and the null marker is emitted.
   */
  public void writeNull() throws TException {
    writeString(null);
  }

  // Returns to the top-level columns.
  @Override
  public void readSetEnd() throws TException {
    inner = false;
  }

  /**
   * Reads the next value as a boolean.
   *
   * @return true only if the text equals "true" ignoring case; null and empty
   *         values read as false
   */
  @Override
  public boolean readBool() throws TException {
    final String text = readString();
    lastPrimitiveWasNullFlag = (text == null);
    // parseBoolean already answers false for null and for the empty string
    return Boolean.parseBoolean(text);
  }

  /**
   * Reads the next value for a numeric read and records whether it was null.
   *
   * @return the text to parse, or null when the value is null or empty (both
   *         read as zero)
   */
  private String readNumericText() throws TException {
    final String text = readString();
    lastPrimitiveWasNullFlag = (text == null);
    return (text == null || text.isEmpty()) ? null : text;
  }

  /**
   * Reads the next value as a byte. Null and empty values read as 0; text
   * that cannot be parsed also reads as 0 and is flagged as null.
   */
  @Override
  public byte readByte() throws TException {
    final String text = readNumericText();
    if (text == null) {
      return 0;
    }
    try {
      return Byte.parseByte(text);
    } catch (NumberFormatException e) {
      lastPrimitiveWasNullFlag = true;
      return 0;
    }
  }

  /**
   * Reads the next value as a short. Null and empty values read as 0; text
   * that cannot be parsed also reads as 0 and is flagged as null.
   */
  @Override
  public short readI16() throws TException {
    final String text = readNumericText();
    if (text == null) {
      return 0;
    }
    try {
      return Short.parseShort(text);
    } catch (NumberFormatException e) {
      lastPrimitiveWasNullFlag = true;
      return 0;
    }
  }

  /**
   * Reads the next value as an int. Null and empty values read as 0; text
   * that cannot be parsed also reads as 0 and is flagged as null.
   */
  @Override
  public int readI32() throws TException {
    final String text = readNumericText();
    if (text == null) {
      return 0;
    }
    try {
      return Integer.parseInt(text);
    } catch (NumberFormatException e) {
      lastPrimitiveWasNullFlag = true;
      return 0;
    }
  }

  /**
   * Reads the next value as a long. Null and empty values read as 0; text
   * that cannot be parsed also reads as 0 and is flagged as null.
   */
  @Override
  public long readI64() throws TException {
    final String text = readNumericText();
    if (text == null) {
      return 0;
    }
    try {
      return Long.parseLong(text);
    } catch (NumberFormatException e) {
      lastPrimitiveWasNullFlag = true;
      return 0;
    }
  }

  /**
   * Reads the next value as a double. Null and empty values read as 0; text
   * that cannot be parsed also reads as 0 and is flagged as null.
   */
  @Override
  public double readDouble() throws TException {
    final String text = readNumericText();
    if (text == null) {
      return 0;
    }
    try {
      return Double.parseDouble(text);
    } catch (NumberFormatException e) {
      lastPrimitiveWasNullFlag = true;
      return 0;
    }
  }

  /**
   * Reads the next value as text and advances past it: the next element while
   * inside a list, set or map, the next column otherwise.
   *
   * @return the value; for a value that is missing, null or equal to the null
   *         marker, null when nulls are being returned and "" otherwise
   */
  @Override
  public String readString() throws TException {
    final String[] source = inner ? fields : columns;
    final int position = inner ? innerIndex++ : index++;

    final String value =
        (source == null || position >= source.length) ? null : source[position];

    if (value != null && !value.equals(nullString)) {
      return value;
    }
    return returnNulls ? null : "";
  }

  /**
   * Binary data is not supported by this format.
   *
   * @throws TException
   *           always
   */
  @Override
  public ByteBuffer readBinary() throws TException {
    throw new TException("Not implemented for control separated data");
  }
}