All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hive.ql.exec.vector.expressions.StringExpr Maven / Gradle / Ivy

There is a newer version: 4.0.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec.vector.expressions;

import java.util.Arrays;

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;

/**
 * String expression evaluation helper functions.
 */
public class StringExpr {

  /* Compare two strings from two byte arrays each
   * with their own start position and length.
   * Use lexicographic unsigned byte value order.
   * This is what's used for UTF-8 sort order.
   * Return negative value if arg1 < arg2, 0 if arg1 = arg2,
   * positive if arg1 > arg2.
   */
  public static int compare(byte[] arg1, int start1, int len1, byte[] arg2, int start2, int len2) {
    for (int i = 0; i < len1 && i < len2; i++) {
      // Note the "& 0xff" is just a way to convert unsigned bytes to signed integer.
      int b1 = arg1[i + start1] & 0xff;
      int b2 = arg2[i + start2] & 0xff;
      if (b1 != b2) {
        return b1 - b2;
      }
    }
    return len1 - len2;
  }

  public static int characterCount(byte[] bytes) {
    int end = bytes.length;

    // count characters
    int j = 0;
    int charCount = 0;
    while(j < end) {
      // UTF-8 continuation bytes have 2 high bits equal to 0x80.
      if ((bytes[j] & 0xc0) != 0x80) {
        ++charCount;
      }
      j++;
    }
    return charCount;
  }

  public static int characterCount(byte[] bytes, int start, int length) {
    int end = start + length;

    // count characters
    int j = start;
    int charCount = 0;
    while(j < end) {
      // UTF-8 continuation bytes have 2 high bits equal to 0x80.
      if ((bytes[j] & 0xc0) != 0x80) {
        ++charCount;
      }
      j++;
    }
    return charCount;
  }

  // A setVal with the same function signature as rightTrim, leftTrim, truncate, etc, below.
  // Useful for class generation via templates.
  public static void assign(BytesColumnVector outV, int i, byte[] bytes, int start, int length) {
    // set output vector
    outV.setVal(i, bytes, start, length);
  }

  /*
   * Right trim a slice of a byte array and return the new byte length.
   */
  public static int rightTrim(byte[] bytes, int start, int length) {
    // skip trailing blank characters
    int j = start + length - 1;
    while(j >= start && bytes[j] == 0x20) {
      j--;
    }

    return (j - start) + 1;
  }

  /*
   * Right trim a slice of a byte array and place the result into element i of a vector.
   */
  public static void rightTrim(BytesColumnVector outV, int i, byte[] bytes, int start, int length) {
    // skip trailing blank characters
    int j = start + length - 1;
    while(j >= start && bytes[j] == 0x20) {
      j--;
    }

    // set output vector
    outV.setVal(i, bytes, start, (j - start) + 1);
  }

  /*
   * Truncate a slice of a byte array to a maximum number of characters and
   * return the new byte length.
   */
  public static int truncate(byte[] bytes, int start, int length, int maxLength) {
    int end = start + length;

    // count characters forward
    int j = start;
    int charCount = 0;
    while(j < end) {
      // UTF-8 continuation bytes have 2 high bits equal to 0x80.
      if ((bytes[j] & 0xc0) != 0x80) {
        if (charCount == maxLength) {
          break;
        }
        ++charCount;
      }
      j++;
    }
    return (j - start);
  }

  /*
   * Truncate a slice of a byte array to a maximum number of characters and
   * place the result into element i of a vector.
   */
  public static void truncate(BytesColumnVector outV, int i, byte[] bytes, int start, int length, int maxLength) {
    int end = start + length;

    // count characters forward
    int j = start;
    int charCount = 0;
    while(j < end) {
      // UTF-8 continuation bytes have 2 high bits equal to 0x80.
      if ((bytes[j] & 0xc0) != 0x80) {
        if (charCount == maxLength) {
          break;
        }
        ++charCount;
      }
      j++;
    }

    // set output vector
    outV.setVal(i, bytes, start, (j - start));
  }

  /*
   * Truncate a byte array to a maximum number of characters and
   * return a byte array with only truncated bytes.
   */
  public static byte[] truncateScalar(byte[] bytes, int maxLength) {
    int end = bytes.length;

    // count characters forward
    int j = 0;
    int charCount = 0;
    while(j < end) {
      // UTF-8 continuation bytes have 2 high bits equal to 0x80.
      if ((bytes[j] & 0xc0) != 0x80) {
        if (charCount == maxLength) {
          break;
        }
        ++charCount;
      }
      j++;
    }
    if (j == end) {
      return bytes;
    } else {
      return Arrays.copyOf(bytes, j);
    }
  }

  /*
   * Right trim and truncate a slice of a byte array to a maximum number of characters and
   * return the new byte length.
   */
  public static int rightTrimAndTruncate(byte[] bytes, int start, int length, int maxLength) {
    int end = start + length;

    // count characters forward and watch for final run of pads
    int j = start;
    int charCount = 0;
    int padRunStart = -1;
    while(j < end) {
      // UTF-8 continuation bytes have 2 high bits equal to 0x80.
      if ((bytes[j] & 0xc0) != 0x80) {
        if (charCount == maxLength) {
          break;
        }
        if (bytes[j] == 0x20) {
          if (padRunStart == -1) {
            padRunStart = j;
          }
        } else {
          padRunStart = -1;
        }
        ++charCount;
      } else {
        padRunStart = -1;
      }
      j++;
    }
    if (padRunStart != -1) {
      return (padRunStart - start);
    } else {
      return (j - start);
    }
  }

  /*
   * Right trim and truncate a slice of a byte array to a maximum number of characters and
   * place the result into element i of a vector.
   */
  public static void rightTrimAndTruncate(BytesColumnVector outV, int i, byte[] bytes, int start, int length, int maxLength) {
    int end = start + length;

    // count characters forward and watch for final run of pads
    int j = start;
    int charCount = 0;
    int padRunStart = -1;
    while(j < end) {
      // UTF-8 continuation bytes have 2 high bits equal to 0x80.
      if ((bytes[j] & 0xc0) != 0x80) {
        if (charCount == maxLength) {
          break;
        }
        if (bytes[j] == 0x20) {
          if (padRunStart == -1) {
            padRunStart = j;
          }
        } else {
          padRunStart = -1;
        }
        ++charCount;
      } else {
        padRunStart = -1;
      }
      j++;
    }
    // set output vector
    if (padRunStart != -1) {
      outV.setVal(i, bytes, start, (padRunStart - start));
    } else {
      outV.setVal(i, bytes, start, (j - start) );
    }
  }

  /*
   * Right trim and truncate a byte array to a maximum number of characters and
   * return a byte array with only the trimmed and truncated bytes.
   */
  public static byte[] rightTrimAndTruncateScalar(byte[] bytes, int maxLength) {
    int end = bytes.length;

    // count characters forward and watch for final run of pads
    int j = 0;
    int charCount = 0;
    int padRunStart = -1;
    while(j < end) {
      // UTF-8 continuation bytes have 2 high bits equal to 0x80.
      if ((bytes[j] & 0xc0) != 0x80) {
        if (charCount == maxLength) {
          break;
        }
        if (bytes[j] == 0x20) {
          if (padRunStart == -1) {
            padRunStart = j;
          }
        } else {
          padRunStart = -1;
        }
        ++charCount;
      } else {
        padRunStart = -1;
      }
      j++;
    }
    if (padRunStart != -1) {
      return Arrays.copyOf(bytes, padRunStart);
    } else if (j == end) {
      return bytes;
    } else {
      return Arrays.copyOf(bytes, j);
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy