All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.tencent.angel.sona.tree.gbdt.metadata.FeatureInfo.scala Maven / Gradle / Ivy

/*
 * Tencent is pleased to support the open source community by making Angel available.
 *
 * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 * https://opensource.org/licenses/Apache-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 *
 */
package com.tencent.angel.sona.tree.gbdt.metadata

import com.tencent.angel.sona.tree.util.MathUtil

import java.{util => ju}
import scala.util.Random

/**
  * Factory methods that assemble a [[FeatureInfo]] from candidate splits.
  *
  * A `null` entry in `splits` denotes a feature without any candidate split; such a feature
  * keeps zero bins and is therefore flagged as unused.
  */
object FeatureInfo {
  // Upper bound (exclusive) on the split count for a feature to be typed as categorical
  // when no explicit feature types are supplied.
  private[gbdt] val ENUM_THRESHOLD = 16

  /**
    * Builds a [[FeatureInfo]] from fully specified metadata and prints a one-line summary
    * of how many features are empty, numerical and categorical.
    *
    * @param featTypes   per-feature flag, `true` for categorical features
    * @param numBin      per-feature bin count; features with a positive count start as used
    * @param splits      per-feature candidate splits, `null` for an empty feature
    * @param defaultBins per-feature default bin index
    */
  private[gbdt] def apply(featTypes: Array[Boolean], numBin: Array[Int],
                          splits: Array[Array[Float]], defaultBins: Array[Int]): FeatureInfo = {

    val usedMask = numBin.map(bins => bins > 0)

    // Pair every split array with its type flag once; both typed counters read from it.
    val typedSplits = splits.zip(featTypes)
    val empCnt = splits.count(s => s == null)
    val numCnt = typedSplits.count { case (s, categorical) => s != null && !categorical }
    val catCnt = typedSplits.count { case (s, categorical) => s != null && categorical }
    println(s"Feature info: empty[$empCnt], numerical[$numCnt], categorical[$catCnt]")

    new FeatureInfo(featTypes, numBin, splits, defaultBins, usedMask)
  }

  /**
    * Derives bin counts and default bins from the candidate splits, then delegates to the
    * four-argument factory.
    *
    * A categorical feature gets one bin more than it has splits, and that extra last bin is
    * its default. A numerical feature gets one bin per split, and its default bin is whatever
    * `MathUtil.indexOf` returns for `0.0f`.
    *
    * @param featTypes per-feature flag, `true` for categorical features
    * @param splits    per-feature candidate splits, `null` for an empty feature
    */
  private[gbdt] def apply(featTypes: Array[Boolean], splits: Array[Array[Float]]): FeatureInfo = {
    require(featTypes.length == splits.length)
    val featCount = featTypes.length
    val binCounts = new Array[Int](featCount)
    val defaults = new Array[Int](featCount)

    var fid = 0
    while (fid < featCount) {
      val candidates = splits(fid)
      if (candidates != null) {
        val splitCount = candidates.length
        if (featTypes(fid)) {
          binCounts(fid) = splitCount + 1
          defaults(fid) = splitCount
        } else {
          binCounts(fid) = splitCount
          defaults(fid) = MathUtil.indexOf(candidates, 0.0f)  // TODO: default bin for continuous feature
        }
      }
      fid += 1
    }

    apply(featTypes, binCounts, splits, defaults)
  }

  /**
    * Infers feature types from the splits alone: a non-empty feature with fewer than
    * `ENUM_THRESHOLD` splits is typed as categorical, every other feature as numerical.
    *
    * @param splits per-feature candidate splits, `null` for an empty feature
    */
  private[gbdt] def apply(splits: Array[Array[Float]]): FeatureInfo = {
    val inferredTypes = splits.map { s =>
      if (s == null) false else s.length < ENUM_THRESHOLD
    }
    apply(inferredTypes, splits)
  }
}

/**
  * Per-feature metadata, stored as parallel arrays indexed by feature id.
  *
  * @param featTypes   per-feature flag, `true` for categorical features
  * @param numBin      per-feature bin count; a feature without a positive bin count is never
  *                    enabled by [[sample]]
  * @param splits      per-feature candidate splits
  * @param defaultBins per-feature default bin index
  * @param isFeatUsed  per-feature "enabled" mask; this array is mutated in place by [[sample]]
  */
case class FeatureInfo(featTypes: Array[Boolean], numBin: Array[Int], splits: Array[Array[Float]],
                       defaultBins: Array[Int], isFeatUsed: Array[Boolean]) {

  /** Whether feature `fid` is categorical. */
  @inline def isCategorical(fid: Int): Boolean = featTypes(fid)

  /** Number of bins of feature `fid`. */
  @inline def getNumBin(fid: Int): Int = numBin(fid)

  /** Candidate splits of feature `fid`. */
  @inline def getSplits(fid: Int): Array[Float] = splits(fid)

  /** Default bin index of feature `fid`. */
  @inline def getDefaultBin(fid: Int): Int = defaultBins(fid)

  /**
    * Re-draws the set of enabled features in place.
    *
    * Let `numSample = ceil(numFeature * ratio)`. When `numSample < numFeature`, `isFeatUsed` is
    * cleared and `numSample` distinct feature ids are drawn uniformly at random (without
    * replacement); a drawn feature is enabled only if it has a positive bin count. When
    * `numSample >= numFeature`, `isFeatUsed` is left untouched.
    *
    * The generator is local to the call, so the shared `scala.util.Random` singleton is never
    * re-seeded as a side effect.
    *
    * @param ratio fraction of the features to draw
    * @param seed  optional seed; two calls with the same seed on equally sized instances draw
    *              the same feature ids. Without a seed a freshly seeded generator is used.
    * @return `true` if the mask was re-drawn, `false` if it was left unchanged
    */
  def sample(ratio: Float, seed: Option[Long] = None): Boolean = {
    val numFeat = numFeature
    val numSample = Math.ceil(numFeat * ratio).toInt
    if (numSample < numFeat) {
      ju.Arrays.fill(isFeatUsed, false)
      val rng = seed.fold(new Random())(s => new Random(s))
      // Partial Fisher-Yates shuffle: after step i, the first i + 1 slots of `fids` hold
      // distinct, uniformly drawn feature ids. Drawing with replacement instead would let
      // duplicates shrink the effective sample below `numSample`.
      val fids = Array.range(0, numFeat)
      for (i <- 0 until numSample) {
        // numSample < numFeat guarantees numFeat - i >= 1 here.
        val j = i + rng.nextInt(numFeat - i)
        val picked = fids(j)
        fids(j) = fids(i)
        fids(i) = picked
        isFeatUsed(picked) = getNumBin(picked) > 0
      }
      true
    } else {
      false
    }
  }

  /** Total number of features described by this instance. */
  lazy val numFeature: Int = featTypes.length
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy