All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hive.ql.exec.vector.expressions.AbstractFilterStringColLikeStringScalar Maven / Gradle / Ivy

There is a newer version: 4.0.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.exec.vector.expressions;

import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;

/**
 * An abstract class for LIKE and REGEXP expressions. LIKE and REGEXP expression share similar
 * functions, but they have different grammars. AbstractFilterStringColLikeStringScalar class
 * provides shared classes and methods. Each subclass handles its grammar.
 */
public abstract class AbstractFilterStringColLikeStringScalar extends VectorExpression {
  private static final long serialVersionUID = 1L;

  private int colNum;
  private String pattern;
  transient Checker checker = null;

  public AbstractFilterStringColLikeStringScalar() {
    super();
  }

  public AbstractFilterStringColLikeStringScalar(int colNum, String pattern) {
    this.colNum = colNum;
    this.pattern = pattern;
  }

  protected abstract List getCheckerFactories();

  /**
   * Selects an optimized checker for a given string.
   * @param pattern
   * @return
   */
  private Checker createChecker(String pattern) {
    for (CheckerFactory checkerFactory : getCheckerFactories()) {
      Checker checker = checkerFactory.tryCreate(pattern);
      if (checker != null) {
        return checker;
      }
    }
    return null;
  }

  @Override
  public void evaluate(VectorizedRowBatch batch) {

    if (checker == null) {
      checker = createChecker(pattern);
    }

    if (childExpressions != null) {
      super.evaluateChildren(batch);
    }

    BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
    int[] sel = batch.selected;
    boolean[] nullPos = inputColVector.isNull;
    int n = batch.size;
    byte[][] vector = inputColVector.vector;
    int[] length = inputColVector.length;
    int[] start = inputColVector.start;

    // return immediately if batch is empty
    if (n == 0) {
      return;
    }

    if (inputColVector.noNulls) {
      if (inputColVector.isRepeating) {

        // All must be selected otherwise size would be zero Repeating property will not change.
        if (!checker.check(vector[0], start[0], length[0])) {

          // Entire batch is filtered out.
          batch.size = 0;
        }
      } else if (batch.selectedInUse) {
        int newSize = 0;

        for (int j = 0; j != n; j++) {
          int i = sel[j];
          if (checker.check(vector[i], start[i], length[i])) {
            sel[newSize++] = i;
          }
        }

        batch.size = newSize;
      } else {
        int newSize = 0;
        for (int i = 0; i != n; i++) {
          if (checker.check(vector[i], start[i], length[i])) {
            sel[newSize++] = i;
          }
        }

        if (newSize < n) {
          batch.size = newSize;
          batch.selectedInUse = true;
        }
      }
    } else {
      if (inputColVector.isRepeating) {

        //All must be selected otherwise size would be zero. Repeating property will not change.
        if (!nullPos[0]) {
          if (!checker.check(vector[0], start[0], length[0])) {

            //Entire batch is filtered out.
            batch.size = 0;
          }
        } else {
          batch.size = 0;
        }
      } else if (batch.selectedInUse) {
        int newSize = 0;

        for (int j = 0; j != n; j++) {
          int i = sel[j];
          if (!nullPos[i]) {
            if (checker.check(vector[i], start[i], length[i])) {
              sel[newSize++] = i;
            }
          }
        }

        //Change the selected vector
        batch.size = newSize;
      } else {
        int newSize = 0;

        for (int i = 0; i != n; i++) {
          if (!nullPos[i]) {
            if (checker.check(vector[i], start[i], length[i])) {
              sel[newSize++] = i;
            }
          }
        }

        if (newSize < n) {
          batch.size = newSize;
          batch.selectedInUse = true;
        }

        /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline
         * future operations. So selectedInUse will remain false.
         */
      }
    }
  }

  @Override
  public int getOutputColumn() {
    return -1;
  }

  @Override
  public String getOutputType() {
    return "boolean";
  }

  /**
   * A Checker contains a pattern and checks whether a given string matches or not.
   */
  public interface Checker {
    /**
     * Checks whether the given string matches with its pattern.
     * @param byteS The byte array that contains the string
     * @param start The start position of the string
     * @param len The length of the string
     * @return Whether it matches or not.
     */
    boolean check(byte[] byteS, int start, int len);
  }

  /**
   * A CheckerFactory creates checkers of its kind.
   */
  protected interface CheckerFactory {
    /**
     * If the given pattern is acceptable for its checker class, it creates and returns a checker.
     * Otherwise, it returns null.
     * @param pattern
     * @return If the pattern is acceptable, a Checker object. Otherwise
     * null.
     */
    Checker tryCreate(String pattern);
  }

  /**
   * Matches the whole string to its pattern.
   */
  protected static class NoneChecker implements Checker {
    byte [] byteSub;

    NoneChecker(String pattern) {
      try {
        byteSub = pattern.getBytes("UTF-8");
      } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(e);
      }
    }

    public boolean check(byte[] byteS, int start, int len) {
      int lenSub = byteSub.length;
      if (len != lenSub) {
        return false;
      }
      for (int i = start, j = 0; j < len; i++, j++) {
        if (byteS[i] != byteSub[j]) {
          return false;
        }
      }
      return true;
    }
  }

  /**
   * Matches the beginning of each string to a pattern.
   */
  protected static class BeginChecker implements Checker {
    byte[] byteSub;

    BeginChecker(String pattern) {
      try {
        byteSub = pattern.getBytes("UTF-8");
      } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(e);
      }
    }

    public boolean check(byte[] byteS, int start, int len) {
      if (len < byteSub.length) {
        return false;
      }
      for (int i = start, j = 0; j < byteSub.length; i++, j++) {
        if (byteS[i] != byteSub[j]) {
          return false;
        }
      }
      return true;
    }
  }

  /**
   * Matches the ending of each string to its pattern.
   */
  protected static class EndChecker implements Checker {
    byte[] byteSub;
    EndChecker(String pattern) {
      try {
        byteSub = pattern.getBytes("UTF-8");
      } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(e);
      }
    }

    public boolean check(byte[] byteS, int start, int len) {
      int lenSub = byteSub.length;
      if (len < lenSub) {
        return false;
      }
      for (int i = start + len - lenSub, j = 0; j < lenSub; i++, j++) {
        if (byteS[i] != byteSub[j]) {
          return false;
        }
      }
      return true;
    }
  }

  /**
   * Matches the middle of each string to its pattern.
   */
  protected static class MiddleChecker implements Checker {
    byte[] byteSub;
    int lenSub;

    MiddleChecker(String pattern) {
      try {
        byteSub = pattern.getBytes("UTF-8");
        lenSub = byteSub.length;
      } catch (UnsupportedEncodingException e) {
        throw new RuntimeException(e);
      }
    }

    public boolean check(byte[] byteS, int start, int len) {
      if (len < lenSub) {
        return false;
      }
      int end = start + len - lenSub + 1;
      boolean match = false;
      for (int i = start; i < end; i++) {
        match = true;
        for (int j = 0; j < lenSub; j++) {
          if (byteS[i + j] != byteSub[j]) {
            match = false;
            break;
          }
        }
        if (match) {
          return true;
        }
      }
      return match;
    }
  }

  /**
   * Matches each string to a pattern with Java regular expression package.
   */
  protected static class ComplexChecker implements Checker {
    Pattern compiledPattern;
    Matcher matcher;
    FastUTF8Decoder decoder;

    ComplexChecker(String pattern) {
      compiledPattern = Pattern.compile(pattern);
      matcher = compiledPattern.matcher("");
      decoder = new FastUTF8Decoder();
    }

    public boolean check(byte[] byteS, int start, int len) {
      // Match the given bytes with the like pattern
      matcher.reset(decoder.decodeUnsafely(byteS, start, len));
      return matcher.find(0);
    }
  }

  /**
   * A fast UTF-8 decoder that caches necessary objects for decoding.
   */
  private static class FastUTF8Decoder {
    CharsetDecoder decoder;
    ByteBuffer byteBuffer;
    CharBuffer charBuffer;

    public FastUTF8Decoder() {
      decoder = Charset.forName("UTF-8").newDecoder()
          .onMalformedInput(CodingErrorAction.REPLACE)
          .onUnmappableCharacter(CodingErrorAction.REPLACE);
      byteBuffer = ByteBuffer.allocate(4);
      charBuffer = CharBuffer.allocate(4);
    }

    public CharBuffer decodeUnsafely(byte[] byteS, int start, int len) {
      // Prepare buffers
      if (byteBuffer.capacity() < len) {
        byteBuffer = ByteBuffer.allocate(len * 2);
      }
      byteBuffer.clear();
      byteBuffer.put(byteS, start, len);
      byteBuffer.flip();

      int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte());
      if (charBuffer.capacity() < maxChars) {
        charBuffer = CharBuffer.allocate(maxChars);
      }
      charBuffer.clear();

      // Decode UTF-8
      decoder.reset();
      decoder.decode(byteBuffer, charBuffer, true);
      decoder.flush(charBuffer);
      charBuffer.flip();

      return charBuffer;
    }
  }

  public int getColNum() {
    return colNum;
  }

  public void setColNum(int colNum) {
    this.colNum = colNum;
  }

  public String getPattern() {
    return pattern;
  }

  public void setPattern(String pattern) {
    this.pattern = pattern;
  }

  @Override
  public VectorExpressionDescriptor.Descriptor getDescriptor() {
    return (new VectorExpressionDescriptor.Builder())
        .setMode(
            VectorExpressionDescriptor.Mode.FILTER)
        .setNumArguments(2)
        .setArgumentTypes(
            VectorExpressionDescriptor.ArgumentType.STRING_FAMILY,
            VectorExpressionDescriptor.ArgumentType.STRING_FAMILY)
        .setInputExpressionTypes(
            VectorExpressionDescriptor.InputExpressionType.COLUMN,
            VectorExpressionDescriptor.InputExpressionType.SCALAR).build();
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy