All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.orc.impl.mask.RedactMaskFactory Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.orc.impl.mask;

import io.prestosql.hive.$internal.org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.io.Text;
import org.apache.orc.DataMask;
import org.apache.orc.TypeDescription;

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Map;
import java.util.SortedMap;
import java.util.TimeZone;
import java.util.TreeMap;
import java.util.concurrent.TimeUnit;

/**
 * Masking strategy that hides most string and numeric values based on unicode
 * character categories.
 *
 * Masking Parameters:
 *   character replacements: string of 10 characters one per group below
 *     letter, upper case (default X)
 *     letter, lower case (default x)
 *     number, digit      (default 9)
 *     symbol             (default $)
 *     punctuation        (default .)
 *     separator          (default no masking)
 *     letter, other      (default ª)
 *     mark               (default ः)
 *     number, other      (default ²)
 *     other              (default ۝)
 *
 *   time replacements: string of 6 numbers or _ one per field below
 *     year (0 to 4000, default no masking)
 *     month (1 to 12, default 1)
 *     date (1 to 31, default 1)
 *     hour (0 to 23, default 0)
 *     minute (0 to 59, default 0)
 *     second (0 to 59, default 0)
 *
 * Parameters use "_" for preserve original.
 */
public class RedactMaskFactory extends MaskFactory {

  /**
   * The value to indicate that the value should be preserved.
   */
  private static final int UNMASKED_CHAR = "_".codePointAt(0);
  private static final int UNMASKED_DATE = -1;

  // The default replacements for each character category.
  // I picked a character in the same category so that the masking is
  // idempotent. For non-ascii characters, I mostly picked the first example.
  private static final int DEFAULT_LETTER_UPPER = "X".codePointAt(0);
  private static final int DEFAULT_LETTER_LOWER = "x".codePointAt(0);
  private static final int DEFAULT_NUMBER_DIGIT = 9;
  private static final int DEFAULT_NUMBER_DIGIT_CP =
      Integer.toString(DEFAULT_NUMBER_DIGIT).codePointAt(0);
  private static final int DEFAULT_SYMBOL = "$".codePointAt(0);
  private static final int DEFAULT_PUNCTUATION = ".".codePointAt(0);
  private static final int DEFAULT_SEPARATOR = UNMASKED_CHAR;
  private static final int DEFAULT_LETTER_OTHER = "\u00AA".codePointAt(0);
  private static final int DEFAULT_MARK = "\u0903".codePointAt(0);
  private static final int DEFAULT_NUMBER_OTHER = "\u00B2".codePointAt(0);
  private static final int DEFAULT_OTHER = "\u06DD".codePointAt(0);

  // The replacement codepoint for each character category. We use codepoints
  // here so that we don't have to worry about handling long UTF characters
  // as special cases.
  private final int UPPPER_REPLACEMENT;
  private final int LOWER_REPLACEMENT;
  private final int OTHER_LETTER_REPLACEMENT;
  private final int MARK_REPLACEMENT;
  private final int DIGIT_CP_REPLACEMENT;
  private final int OTHER_NUMBER_REPLACEMENT;
  private final int SYMBOL_REPLACEMENT;
  private final int PUNCTUATION_REPLACEMENT;
  private final int SEPARATOR_REPLACEMENT;
  private final int OTHER_REPLACEMENT;

  // numeric replacement
  private final int DIGIT_REPLACEMENT;

  // time replacement
  private final int YEAR_REPLACEMENT;
  private final int MONTH_REPLACEMENT;
  private final int DATE_REPLACEMENT;
  private final int HOUR_REPLACEMENT;
  private final int MINUTE_REPLACEMENT;
  private final int SECOND_REPLACEMENT;
  private final boolean maskDate;
  private final boolean maskTimestamp;

  // index tuples that are not to be masked
  private final SortedMap unmaskIndexRanges = new TreeMap<>();

  public RedactMaskFactory(String... params) {
    ByteBuffer param = params.length < 1 ? ByteBuffer.allocate(0) :
        ByteBuffer.wrap(params[0].getBytes(StandardCharsets.UTF_8));
    UPPPER_REPLACEMENT = getNextCodepoint(param, DEFAULT_LETTER_UPPER);
    LOWER_REPLACEMENT = getNextCodepoint(param, DEFAULT_LETTER_LOWER);
    DIGIT_CP_REPLACEMENT = getNextCodepoint(param, DEFAULT_NUMBER_DIGIT_CP);
    DIGIT_REPLACEMENT = getReplacementDigit(DIGIT_CP_REPLACEMENT);
    SYMBOL_REPLACEMENT = getNextCodepoint(param, DEFAULT_SYMBOL);
    PUNCTUATION_REPLACEMENT = getNextCodepoint(param, DEFAULT_PUNCTUATION);
    SEPARATOR_REPLACEMENT = getNextCodepoint(param, DEFAULT_SEPARATOR);
    OTHER_LETTER_REPLACEMENT = getNextCodepoint(param, DEFAULT_LETTER_OTHER);
    MARK_REPLACEMENT = getNextCodepoint(param, DEFAULT_MARK);
    OTHER_NUMBER_REPLACEMENT = getNextCodepoint(param, DEFAULT_NUMBER_OTHER);
    OTHER_REPLACEMENT = getNextCodepoint(param, DEFAULT_OTHER);
    String[] timeParams;
    if (params.length < 2 || StringUtils.isBlank(params[1])) {
      timeParams = null;
    } else {
      timeParams = params[1].split("\\W+");
    }
    YEAR_REPLACEMENT = getDateParam(timeParams, 0, UNMASKED_DATE, 4000);
    MONTH_REPLACEMENT = getDateParam(timeParams, 1, 1, 12);
    DATE_REPLACEMENT = getDateParam(timeParams, 2, 1, 31);
    HOUR_REPLACEMENT = getDateParam(timeParams, 3, 0, 23);
    MINUTE_REPLACEMENT = getDateParam(timeParams, 4, 0, 59);
    SECOND_REPLACEMENT = getDateParam(timeParams, 5, 0, 59);
    maskDate = (YEAR_REPLACEMENT != UNMASKED_DATE) ||
        (MONTH_REPLACEMENT != UNMASKED_DATE) ||
        (DATE_REPLACEMENT != UNMASKED_DATE);
    maskTimestamp = maskDate || (HOUR_REPLACEMENT != UNMASKED_DATE) ||
        (MINUTE_REPLACEMENT != UNMASKED_DATE) ||
        (SECOND_REPLACEMENT != UNMASKED_DATE);

    /* un-mask range */
    if(!(params.length < 3 || StringUtils.isBlank(params[2]))) {
      String[] unmaskIndexes = params[2].split(",");

      for(int i=0; i < unmaskIndexes.length; i++ ) {
        String[] pair = unmaskIndexes[i].trim().split(":");
        unmaskIndexRanges.put(Integer.parseInt(pair[0]), Integer.parseInt(pair[1]));
      }
    }
  }

  @Override
  protected DataMask buildBooleanMask(TypeDescription schema) {
    if (DIGIT_CP_REPLACEMENT == UNMASKED_CHAR) {
      return new LongIdentity();
    } else {
      return new BooleanRedactConverter();
    }
  }

  @Override
  protected DataMask buildLongMask(TypeDescription schema) {
    if (DIGIT_CP_REPLACEMENT == UNMASKED_CHAR) {
      return new LongIdentity();
    } else {
      return new LongRedactConverter(schema.getCategory());
    }
  }

  @Override
  protected DataMask buildDecimalMask(TypeDescription schema) {
    if (DIGIT_CP_REPLACEMENT == UNMASKED_CHAR) {
      return new DecimalIdentity();
    } else {
      return new DecimalRedactConverter();
    }
  }

  @Override
  protected DataMask buildDoubleMask(TypeDescription schema) {
    if (DIGIT_CP_REPLACEMENT == UNMASKED_CHAR) {
      return new DoubleIdentity();
    } else {
      return new DoubleRedactConverter();
    }
  }

  @Override
  protected DataMask buildStringMask(TypeDescription schema) {
    return new StringConverter();
  }

  @Override
  protected DataMask buildDateMask(TypeDescription schema) {
    if (maskDate) {
      return new DateRedactConverter();
    } else {
      return new LongIdentity();
    }
  }

  @Override
  protected DataMask buildTimestampMask(TypeDescription schema) {
    if (maskTimestamp) {
      return new TimestampRedactConverter();
    } else {
      return new TimestampIdentity();
    }
  }

  @Override
  protected DataMask buildBinaryMask(TypeDescription schema) {
    return new NullifyMask();
  }

  class LongRedactConverter implements DataMask {
    final long mask;

    LongRedactConverter(TypeDescription.Category category) {
      switch (category) {
        case BYTE:
          mask = 0xff;
          break;
        case SHORT:
          mask = 0xffff;
          break;
        case INT:
          mask = 0xffff_ffff;
          break;
        default:
        case LONG:
          mask = -1;
          break;
      }
    }

    @Override
    public void maskData(ColumnVector original, ColumnVector masked, int start,
                         int length) {
      LongColumnVector target = (LongColumnVector) masked;
      LongColumnVector source = (LongColumnVector) original;
      target.noNulls = original.noNulls;
      target.isRepeating = original.isRepeating;
      if (original.isRepeating) {
        target.vector[0] = maskLong(source.vector[0]) & mask;
        target.isNull[0] = source.isNull[0];
      } else {
        for(int r = start; r < start + length; ++r) {
          target.vector[r] = maskLong(source.vector[r]) & mask;
          target.isNull[r] = source.isNull[r];
        }
      }
    }
  }

  class BooleanRedactConverter implements DataMask {
    @Override
    public void maskData(ColumnVector original, ColumnVector masked, int start,
                         int length) {
      LongColumnVector target = (LongColumnVector) masked;
      LongColumnVector source = (LongColumnVector) original;
      target.noNulls = original.noNulls;
      target.isRepeating = original.isRepeating;
      if (original.isRepeating) {
        target.vector[0] = DIGIT_REPLACEMENT == 0 ? 0 : 1;
        target.isNull[0] = source.isNull[0];
      } else {
        for(int r = start; r < start + length; ++r) {
          target.vector[r] = DIGIT_REPLACEMENT == 0 ? 0 : 1;
          target.isNull[r] = source.isNull[r];
        }
      }
    }
  }

  class DoubleRedactConverter implements DataMask {
    @Override
    public void maskData(ColumnVector original, ColumnVector masked, int start,
                         int length) {
      DoubleColumnVector target = (DoubleColumnVector) masked;
      DoubleColumnVector source = (DoubleColumnVector) original;
      target.noNulls = original.noNulls;
      target.isRepeating = original.isRepeating;
      if (original.isRepeating) {
        target.vector[0] = maskDouble(source.vector[0]);
        target.isNull[0] = source.isNull[0];
      } else {
        for(int r = start; r < start + length; ++r) {
          target.vector[r] = maskDouble(source.vector[r]);
          target.isNull[r] = source.isNull[r];
        }
      }
    }
  }

  class StringConverter implements DataMask {
    @Override
    public void maskData(ColumnVector original, ColumnVector masked, int start,
                         int length) {
      BytesColumnVector target = (BytesColumnVector) masked;
      BytesColumnVector source = (BytesColumnVector) original;
      target.noNulls = original.noNulls;
      target.isRepeating = original.isRepeating;
      if (original.isRepeating) {
        target.isNull[0] = source.isNull[0];
        if (target.noNulls || !target.isNull[0]) {
          maskString(source, 0, target);
        }
      } else {
        for(int r = start; r < start + length; ++r) {
          target.isNull[r] = source.isNull[r];
          if (target.noNulls || !target.isNull[r]) {
            maskString(source, r, target);
          }
        }
      }
    }
  }

  class DecimalRedactConverter implements DataMask {
    @Override
    public void maskData(ColumnVector original, ColumnVector masked, int start,
                         int length) {
      DecimalColumnVector target = (DecimalColumnVector) masked;
      DecimalColumnVector source = (DecimalColumnVector) original;
      target.noNulls = original.noNulls;
      target.isRepeating = original.isRepeating;
      target.scale = source.scale;
      target.precision = source.precision;
      if (original.isRepeating) {
        target.isNull[0] = source.isNull[0];
        if (target.noNulls || !target.isNull[0]) {
          target.vector[0].set(maskDecimal(source.vector[0]));
        }
      } else {
        for(int r = start; r < start + length; ++r) {
          target.isNull[r] = source.isNull[r];
          if (target.noNulls || !target.isNull[r]) {
            target.vector[r].set(maskDecimal(source.vector[r]));
          }
        }
      }
    }
  }

  class TimestampRedactConverter implements DataMask {

    @Override
    public void maskData(ColumnVector original, ColumnVector masked, int start,
                         int length) {
      TimestampColumnVector target = (TimestampColumnVector) masked;
      TimestampColumnVector source = (TimestampColumnVector) original;
      target.noNulls = original.noNulls;
      target.isRepeating = original.isRepeating;
      if (original.isRepeating) {
        target.isNull[0] = source.isNull[0];
        if (target.noNulls || !target.isNull[0]) {
          target.time[0] = maskTime(source.time[0]);
          target.nanos[0] = 0;
        }
      } else {
        for(int r = start; r < start + length; ++r) {
          target.isNull[r] = source.isNull[r];
          if (target.noNulls || !target.isNull[r]) {
            target.time[r] = maskTime(source.time[r]);
            target.nanos[r] = 0;
          }
        }
      }
    }
  }

  class DateRedactConverter implements DataMask {

    @Override
    public void maskData(ColumnVector original, ColumnVector masked, int start,
                         int length) {
      LongColumnVector target = (LongColumnVector) masked;
      LongColumnVector source = (LongColumnVector) original;
      target.noNulls = original.noNulls;
      target.isRepeating = original.isRepeating;
      if (original.isRepeating) {
        target.isNull[0] = source.isNull[0];
        if (target.noNulls || !target.isNull[0]) {
          target.vector[0] = maskDate((int) source.vector[0]);
        }
      } else {
        for(int r = start; r < start + length; ++r) {
          target.isNull[r] = source.isNull[r];
          if (target.noNulls || !target.isNull[r]) {
            target.vector[r] = maskDate((int) source.vector[r]);
          }
        }
      }
    }
  }

  /**
   * Get the next code point from the ByteBuffer. Moves the position in the
   * ByteBuffer forward to the next code point.
   * @param param the source of bytes
   * @param defaultValue if there are no bytes left, use this value
   * @return the code point that was found at the front of the buffer.
   */
  static int getNextCodepoint(ByteBuffer param, int defaultValue) {
    if (param.remaining() == 0) {
      return defaultValue;
    } else {
      return Text.bytesToCodePoint(param);
    }
  }

  /**
   * Get the replacement digit. This routine supports non-ASCII values for the
   * replacement. For example, if the user gives one of "7", "७", "〧" or "፯"
   * the value is 7.
   * @param digitCodePoint the code point that is replacing digits
   * @return the number from 0 to 9 to use as the numeric replacement
   */
  static int getReplacementDigit(int digitCodePoint) {
    int dig = Character.getNumericValue(digitCodePoint);
    if (dig >= 0 && dig <= 9) {
      return dig;
    } else {
      return DEFAULT_NUMBER_DIGIT;
    }
  }

  static int getDateParam(String[] dateParams, int posn,
                          int myDefault, int max) {
    if (dateParams != null && posn < dateParams.length) {
      if (dateParams[posn].codePointAt(0) == UNMASKED_CHAR) {
        return UNMASKED_DATE;
      } else {
        int result = Integer.parseInt(dateParams[posn]);
        if (result >= -1 && result <= max) {
          return result;
        } else {
          throw new IllegalArgumentException("Invalid date parameter " + posn +
              " of " + dateParams[posn] + " greater than " + max);
        }
      }
    } else {
      return myDefault;
    }
  }

  /**
   * Replace each digit in value with DIGIT_REPLACEMENT scaled to the matching
   * number of digits.
   * @param value the number to mask
   * @return the masked value
   */
  public long maskLong(long value) {

    /* check whether unmasking range provided */
    if (!unmaskIndexRanges.isEmpty()) {
      return maskLongWithUnmasking(value);
    }

    long base;
    if (DIGIT_REPLACEMENT == 0) {
      return 0;
    } else if (value >= 0) {
      base = 1;
    } else {
      base = -1;
      // make sure Long.MIN_VALUE doesn't overflow
      if (value == Long.MIN_VALUE) {
        value = Long.MAX_VALUE;
      } else {
        value = -value;
      }
    }
    if (value < 100_000_000L) {
      if (value < 10_000L) {
        if (value < 100L) {
          if (value < 10L) {
            base *= 1;
          } else {
            base *= 11;
          }
        } else if (value < 1_000L) {
          base *= 111;
        } else {
          base *= 1_111;
        }
      } else if (value < 1_000_000L) {
        if (value < 100_000L) {
          base *= 11_111;
        } else {
          base *= 111_111;
        }
      } else if (value < 10_000_000L) {
        base *= 1_111_111;
      } else {
        base *= 11_111_111;
      }
    } else if (value < 10_000_000_000_000_000L) {
      if (value < 1_000_000_000_000L) {
        if (value < 10_000_000_000L) {
          if (value < 1_000_000_000L) {
            base *= 111_111_111;
          } else {
            base *= 1_111_111_111;
          }
        } else if (value < 100_000_000_000L) {
          base *= 11_111_111_111L;
        } else {
          base *= 111_111_111_111L;
        }
      } else if (value < 100_000_000_000_000L) {
        if (value < 10_000_000_000_000L) {
          base *= 1_111_111_111_111L;
        } else {
          base *= 11_111_111_111_111L;
        }
      } else if (value < 1_000_000_000_000_000L) {
        base *= 111_111_111_111_111L;
      } else {
        base *= 1_111_111_111_111_111L;
      }
    } else if (value < 100_000_000_000_000_000L) {
      base *= 11_111_111_111_111_111L;
    // If the digit is 9, it would overflow at 19 digits, so use 18.
    } else if (value < 1_000_000_000_000_000_000L || DIGIT_REPLACEMENT == 9) {
      base *= 111_111_111_111_111_111L;
    } else {
      base *= 1_111_111_111_111_111_111L;
    }

    return DIGIT_REPLACEMENT * base;
  }

  private static final double[] DOUBLE_POWER_10 = new double[]{
      1e-308, 1e-307, 1e-306, 1e-305, 1e-304, 1e-303, 1e-302, 1e-301, 1e-300,
      1e-299, 1e-298, 1e-297, 1e-296, 1e-295, 1e-294, 1e-293, 1e-292, 1e-291,
      1e-290, 1e-289, 1e-288, 1e-287, 1e-286, 1e-285, 1e-284, 1e-283, 1e-282,
      1e-281, 1e-280, 1e-279, 1e-278, 1e-277, 1e-276, 1e-275, 1e-274, 1e-273,
      1e-272, 1e-271, 1e-270, 1e-269, 1e-268, 1e-267, 1e-266, 1e-265, 1e-264,
      1e-263, 1e-262, 1e-261, 1e-260, 1e-259, 1e-258, 1e-257, 1e-256, 1e-255,
      1e-254, 1e-253, 1e-252, 1e-251, 1e-250, 1e-249, 1e-248, 1e-247, 1e-246,
      1e-245, 1e-244, 1e-243, 1e-242, 1e-241, 1e-240, 1e-239, 1e-238, 1e-237,
      1e-236, 1e-235, 1e-234, 1e-233, 1e-232, 1e-231, 1e-230, 1e-229, 1e-228,
      1e-227, 1e-226, 1e-225, 1e-224, 1e-223, 1e-222, 1e-221, 1e-220, 1e-219,
      1e-218, 1e-217, 1e-216, 1e-215, 1e-214, 1e-213, 1e-212, 1e-211, 1e-210,
      1e-209, 1e-208, 1e-207, 1e-206, 1e-205, 1e-204, 1e-203, 1e-202, 1e-201,
      1e-200, 1e-199, 1e-198, 1e-197, 1e-196, 1e-195, 1e-194, 1e-193, 1e-192,
      1e-191, 1e-190, 1e-189, 1e-188, 1e-187, 1e-186, 1e-185, 1e-184, 1e-183,
      1e-182, 1e-181, 1e-180, 1e-179, 1e-178, 1e-177, 1e-176, 1e-175, 1e-174,
      1e-173, 1e-172, 1e-171, 1e-170, 1e-169, 1e-168, 1e-167, 1e-166, 1e-165,
      1e-164, 1e-163, 1e-162, 1e-161, 1e-160, 1e-159, 1e-158, 1e-157, 1e-156,
      1e-155, 1e-154, 1e-153, 1e-152, 1e-151, 1e-150, 1e-149, 1e-148, 1e-147,
      1e-146, 1e-145, 1e-144, 1e-143, 1e-142, 1e-141, 1e-140, 1e-139, 1e-138,
      1e-137, 1e-136, 1e-135, 1e-134, 1e-133, 1e-132, 1e-131, 1e-130, 1e-129,
      1e-128, 1e-127, 1e-126, 1e-125, 1e-124, 1e-123, 1e-122, 1e-121, 1e-120,
      1e-119, 1e-118, 1e-117, 1e-116, 1e-115, 1e-114, 1e-113, 1e-112, 1e-111,
      1e-110, 1e-109, 1e-108, 1e-107, 1e-106, 1e-105, 1e-104, 1e-103, 1e-102,
      1e-101, 1e-100, 1e-99, 1e-98, 1e-97, 1e-96, 1e-95, 1e-94, 1e-93,
      1e-92, 1e-91, 1e-90, 1e-89, 1e-88, 1e-87, 1e-86, 1e-85, 1e-84,
      1e-83, 1e-82, 1e-81, 1e-80, 1e-79, 1e-78, 1e-77, 1e-76, 1e-75,
      1e-74, 1e-73, 1e-72, 1e-71, 1e-70, 1e-69, 1e-68, 1e-67, 1e-66,
      1e-65, 1e-64, 1e-63, 1e-62, 1e-61, 1e-60, 1e-59, 1e-58, 1e-57,
      1e-56, 1e-55, 1e-54, 1e-53, 1e-52, 1e-51, 1e-50, 1e-49, 1e-48,
      1e-47, 1e-46, 1e-45, 1e-44, 1e-43, 1e-42, 1e-41, 1e-40, 1e-39,
      1e-38, 1e-37, 1e-36, 1e-35, 1e-34, 1e-33, 1e-32, 1e-31, 1e-30,
      1e-29, 1e-28, 1e-27, 1e-26, 1e-25, 1e-24, 1e-23, 1e-22, 1e-21,
      1e-20, 1e-19, 1e-18, 1e-17, 1e-16, 1e-15, 1e-14, 1e-13, 1e-12,
      1e-11, 1e-10, 1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3,
      1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6,
      1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15,
      1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24,
      1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31, 1e32, 1e33,
      1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42,
      1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51,
      1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, 1e60,
      1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69,
      1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78,
      1e79, 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87,
      1e88, 1e89, 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96,
      1e97, 1e98, 1e99, 1e100, 1e101, 1e102, 1e103, 1e104, 1e105,
      1e106, 1e107, 1e108, 1e109, 1e110, 1e111, 1e112, 1e113, 1e114,
      1e115, 1e116, 1e117, 1e118, 1e119, 1e120, 1e121, 1e122, 1e123,
      1e124, 1e125, 1e126, 1e127, 1e128, 1e129, 1e130, 1e131, 1e132,
      1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, 1e140, 1e141,
      1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, 1e150,
      1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159,
      1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168,
      1e169, 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177,
      1e178, 1e179, 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186,
      1e187, 1e188, 1e189, 1e190, 1e191, 1e192, 1e193, 1e194, 1e195,
      1e196, 1e197, 1e198, 1e199, 1e200, 1e201, 1e202, 1e203, 1e204,
      1e205, 1e206, 1e207, 1e208, 1e209, 1e210, 1e211, 1e212, 1e213,
      1e214, 1e215, 1e216, 1e217, 1e218, 1e219, 1e220, 1e221, 1e222,
      1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, 1e230, 1e231,
      1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, 1e240,
      1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249,
      1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258,
      1e259, 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267,
      1e268, 1e269, 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276,
      1e277, 1e278, 1e279, 1e280, 1e281, 1e282, 1e283, 1e284, 1e285,
      1e286, 1e287, 1e288, 1e289, 1e290, 1e291, 1e292, 1e293, 1e294,
      1e295, 1e296, 1e297, 1e298, 1e299, 1e300, 1e301, 1e302, 1e303,
      1e304, 1e305, 1e306, 1e307};

  /**
   * Replace each digit in value with digit.
   * @param value the number to mask
   * @return the
   */
  public double maskDouble(double value) {

    /* check whether unmasking range provided */
    if (!unmaskIndexRanges.isEmpty()) {
      return maskDoubleWIthUnmasking(value);
    }

    double base;
    // It seems better to mask 0 to 9.99999 rather than 9.99999e-308.
    if (value == 0 || DIGIT_REPLACEMENT == 0) {
      return DIGIT_REPLACEMENT * 1.11111;
    } else if (value > 0) {
      base = 1.11111;
    } else {
      base = -1.11111;
      value = -value;
    }
    int posn = Arrays.binarySearch(DOUBLE_POWER_10, value);
    if (posn < -DOUBLE_POWER_10.length - 2) {
      posn = DOUBLE_POWER_10.length - 1;
    } else if (posn == -1) {
      posn = 0;
    } else if (posn < 0) {
      posn = -posn -2;
    }
    return DIGIT_REPLACEMENT * base * DOUBLE_POWER_10[posn];
  }

  private final Calendar scratch = Calendar.getInstance();

  /**
   * Given the requested masking parameters, redact the given time
   * @param millis the original time
   * @return the millis after it has been masked
   */
  long maskTime(long millis) {
    scratch.setTimeInMillis(millis);
    if (YEAR_REPLACEMENT != UNMASKED_DATE) {
      scratch.set(Calendar.YEAR, YEAR_REPLACEMENT);
    }
    if (MONTH_REPLACEMENT != UNMASKED_DATE) {
      scratch.set(Calendar.MONTH, MONTH_REPLACEMENT - 1);
    }
    if (DATE_REPLACEMENT != UNMASKED_DATE) {
      scratch.set(Calendar.DATE, DATE_REPLACEMENT);
    }
    if (HOUR_REPLACEMENT != UNMASKED_DATE) {
      if (HOUR_REPLACEMENT >= 12) {
        scratch.set(Calendar.HOUR, HOUR_REPLACEMENT - 12);
        scratch.set(Calendar.AM_PM, Calendar.PM);
      } else {
        scratch.set(Calendar.HOUR, HOUR_REPLACEMENT);
        scratch.set(Calendar.AM_PM, Calendar.AM);
      }
    }
    if (MINUTE_REPLACEMENT != UNMASKED_DATE) {
      scratch.set(Calendar.MINUTE, MINUTE_REPLACEMENT);
    }
    if (SECOND_REPLACEMENT != UNMASKED_DATE) {
      scratch.set(Calendar.SECOND, SECOND_REPLACEMENT);
      scratch.set(Calendar.MILLISECOND, 0);
    }
    return scratch.getTimeInMillis();
  }

  private static final long MILLIS_PER_DAY = TimeUnit.DAYS.toMillis(1);

  private final Calendar utcScratch =
      Calendar.getInstance(TimeZone.getTimeZone("UTC"));

  /**
   * Given a date as the number of days since epoch (1 Jan 1970),
   * mask the date given the parameters.
   * @param daysSinceEpoch the number of days after epoch
   * @return the number of days after epoch when masked
   */
  int maskDate(int daysSinceEpoch) {
    utcScratch.setTimeInMillis(daysSinceEpoch * MILLIS_PER_DAY);
    if (YEAR_REPLACEMENT != UNMASKED_DATE) {
      utcScratch.set(Calendar.YEAR, YEAR_REPLACEMENT);
    }
    if (MONTH_REPLACEMENT != UNMASKED_DATE) {
      utcScratch.set(Calendar.MONTH, MONTH_REPLACEMENT - 1);
    }
    if (DATE_REPLACEMENT != UNMASKED_DATE) {
      utcScratch.set(Calendar.DATE, DATE_REPLACEMENT);
    }
    return (int) (utcScratch.getTimeInMillis() / MILLIS_PER_DAY);
  }

  /**
   * Mask a decimal.
   * This is painfully slow because it converts to a string and then back to
   * a decimal. Until HiveDecimalWritable gives us more access, this is
   * the best tradeoff between developer time, functionality, and run time.
   * @param source the value to mask
   * @return the masked value.
   */
  HiveDecimalWritable maskDecimal(HiveDecimalWritable source) {
    return new HiveDecimalWritable(maskNumericString(source.toString()));
  }

  /**
   * Given a UTF code point, find the replacement codepoint
   * @param codepoint a UTF character
   * @return the replacement codepoint
   */
  int getReplacement(int codepoint) {
    switch (Character.getType(codepoint)) {
      case Character.UPPERCASE_LETTER:
        return UPPPER_REPLACEMENT;
      case Character.LOWERCASE_LETTER:
        return LOWER_REPLACEMENT;
      case Character.TITLECASE_LETTER:
      case Character.MODIFIER_LETTER:
      case Character.OTHER_LETTER:
        return OTHER_LETTER_REPLACEMENT;
      case Character.NON_SPACING_MARK:
      case Character.ENCLOSING_MARK:
      case Character.COMBINING_SPACING_MARK:
        return MARK_REPLACEMENT;
      case Character.DECIMAL_DIGIT_NUMBER:
        return DIGIT_CP_REPLACEMENT;
      case Character.LETTER_NUMBER:
      case Character.OTHER_NUMBER:
        return OTHER_NUMBER_REPLACEMENT;
      case Character.SPACE_SEPARATOR:
      case Character.LINE_SEPARATOR:
      case Character.PARAGRAPH_SEPARATOR:
        return SEPARATOR_REPLACEMENT;
      case Character.MATH_SYMBOL:
      case Character.CURRENCY_SYMBOL:
      case Character.MODIFIER_SYMBOL:
      case Character.OTHER_SYMBOL:
        return SYMBOL_REPLACEMENT;
      case Character.DASH_PUNCTUATION:
      case Character.START_PUNCTUATION:
      case Character.END_PUNCTUATION:
      case Character.CONNECTOR_PUNCTUATION:
      case Character.OTHER_PUNCTUATION:
        return PUNCTUATION_REPLACEMENT;
      default:
        return OTHER_REPLACEMENT;
    }
  }

  /**
   * Get the number of bytes for each codepoint
   * @param codepoint the codepoint to check
   * @return the number of bytes
   */
  static int getCodepointLength(int codepoint) {
    if (codepoint < 0) {
      throw new IllegalArgumentException("Illegal codepoint " + codepoint);
    } else if (codepoint < 0x80) {
      return 1;
    } else if (codepoint < 0x7ff) {
      return 2;
    } else if (codepoint < 0xffff) {
      return 3;
    } else if (codepoint < 0x10FFFF) {
      return 4;
    } else {
      throw new IllegalArgumentException("Illegal codepoint " + codepoint);
    }
  }

  /**
   * Write the give codepoint to the buffer.
   * @param codepoint the codepoint to write
   * @param buffer the buffer to write into
   * @param offset the first offset to use
   * @param length the number of bytes that will be used
   */
  static void writeCodepoint(int codepoint, byte[] buffer, int offset,
                             int length) {
    switch (length) {
      case 1:
        buffer[offset] = (byte) codepoint;
        break;
      case 2:
        buffer[offset] = (byte)(0xC0 | codepoint >> 6);
        buffer[offset+1] = (byte)(0x80 | (codepoint & 0x3f));
        break;
      case 3:
        buffer[offset] = (byte)(0xE0 | codepoint >> 12);
        buffer[offset+1] = (byte)(0x80 | ((codepoint >> 6) & 0x3f));
        buffer[offset+2] = (byte)(0x80 | (codepoint & 0x3f));
        break;
      case 4:
        buffer[offset] = (byte)(0xF0 | codepoint >> 18);
        buffer[offset+1] = (byte)(0x80 | ((codepoint >> 12) & 0x3f));
        buffer[offset+2] = (byte)(0x80 | ((codepoint >> 6) & 0x3f));
        buffer[offset+3] = (byte)(0x80 | (codepoint & 0x3f));
        break;
      default:
        throw new IllegalArgumentException("Invalid length for codepoint " +
            codepoint + " = " + length);
    }
  }

  /**
   * Mask a string by finding the character category of each character
   * and replacing it with the matching literal.
   * @param source the source column vector
   * @param row the value index
   * @param target the target column vector
   */
  void maskString(BytesColumnVector source, int row, BytesColumnVector target) {
    int expectedBytes = source.length[row];
    ByteBuffer sourceBytes = ByteBuffer.wrap(source.vector[row],
        source.start[row], source.length[row]);
    // ensure we have enough space, if the masked data is the same size
    target.ensureValPreallocated(expectedBytes);
    byte[] outputBuffer = target.getValPreallocatedBytes();
    int outputOffset = target.getValPreallocatedStart();
    int outputStart = outputOffset;

    int index = 0;
    while (sourceBytes.remaining() > 0) {
      int cp = Text.bytesToCodePoint(sourceBytes);

      // Find the replacement for the current character.
      int replacement = getReplacement(cp);
      if (replacement == UNMASKED_CHAR || isIndexInUnmaskRange(index, source.length[row])) {
        replacement = cp;
      }

      // increment index
      index++;

      int len = getCodepointLength(replacement);

      // If the translation will overflow the buffer, we need to resize.
      // This will only happen when the masked size is larger than the original.
      if (len + outputOffset > outputBuffer.length) {
        // Revise estimate how much we are going to need now. We are maximally
        // pesamistic here so that we don't have to expand again for this value.
        int currentOutputStart =  outputStart;
        int currentOutputLength = outputOffset - currentOutputStart;
        expectedBytes = currentOutputLength + len + sourceBytes.remaining() * 4;

        // Expand the buffer to fit the new estimate
        target.ensureValPreallocated(expectedBytes);

        // Copy over the bytes we've already written for this value and move
        // the pointers to the new output buffer.
        byte[] oldBuffer = outputBuffer;
        outputBuffer = target.getValPreallocatedBytes();
        outputOffset = target.getValPreallocatedStart();
        outputStart = outputOffset;
        System.arraycopy(oldBuffer, currentOutputStart, outputBuffer,
            outputOffset, currentOutputLength);
        outputOffset += currentOutputLength;
      }

      // finally copy the bytes
      writeCodepoint(replacement, outputBuffer, outputOffset, len);
      outputOffset += len;
    }
    target.setValPreallocated(row, outputOffset - outputStart);
  }

  static final long OVERFLOW_REPLACEMENT = 111_111_111_111_111_111L;

  /**
   * A function that masks longs when there are unmasked ranges.
   * @param value the original value
   * @return the masked value
   */
  long maskLongWithUnmasking(long value) throws IndexOutOfBoundsException {
    try {
      return Long.parseLong(maskNumericString(Long.toString(value)));
    } catch (NumberFormatException nfe) {
      return OVERFLOW_REPLACEMENT * DIGIT_REPLACEMENT;
    }
  }

  /**
   * A function that masks doubles when there are unmasked ranges.
   * @param value original value
   * @return masked value
   */
  double maskDoubleWIthUnmasking(final double value) {
    try {
      return Double.parseDouble(maskNumericString(Double.toString(value)));
    } catch (NumberFormatException nfe) {
      return OVERFLOW_REPLACEMENT * DIGIT_REPLACEMENT;
    }
  }

  /**
   * Mask the given stringified numeric value excluding the unmask range.
   * Non-digit characters are passed through on the assumption they are
   * markers (eg. one of ",.ef").
   * @param value the original value.
   */
  String maskNumericString(final String value) {
    StringBuilder result = new StringBuilder();
    final int length = value.codePointCount(0, value.length());
    for(int c=0; c < length; ++c) {
      int cp = value.codePointAt(c);
      if (isIndexInUnmaskRange(c, length) ||
          Character.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
        result.appendCodePoint(cp);
      } else {
        result.appendCodePoint(DIGIT_CP_REPLACEMENT);
      }
    }
    return result.toString();
  }

  /**
   * Given an index and length of a string
   * find out whether it is in a given un-mask range.
   * @param index the character point index
   * @param length the length of the string in character points
   * @return true if the index is in un-mask range else false.
   */
  private boolean isIndexInUnmaskRange(final int index, final int length) {

    for(final Map.Entry pair : unmaskIndexRanges.entrySet()) {
       int start;
       int end;

       if(pair.getKey() >= 0) {
         // for positive indexes
         start = pair.getKey();
       } else {
         // for negative indexes
         start = length + pair.getKey();
       }

       if(pair.getValue() >= 0) {
         // for positive indexes
         end = pair.getValue();
       } else {
         // for negative indexes
         end = length + pair.getValue();
       }

      // if the given index is in range
      if(index >= start && index <= end ) {
        return true;
      }

    }

    return false;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy