// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.uci.jforestsx.input;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import edu.uci.jforestsx.dataset.BitNumericArray;
import edu.uci.jforestsx.dataset.ByteNumericArray;
import edu.uci.jforestsx.dataset.NullNumericArray;
import edu.uci.jforestsx.dataset.ShortNumericArray;
import edu.uci.jforestsx.dataset.Feature;
import edu.uci.jforestsx.dataset.NumericArray;
import edu.uci.jforestsx.input.sparse.SparseTextFileLine;
import edu.uci.jforestsx.input.sparse.SparseTextFileReader;
import edu.uci.jforestsx.util.ArraysUtil;
import edu.uci.jforestsx.util.Timer;
/**
* @author Yasser Ganjisaffar
*/
public class BinaryFileGenerator {
private List> valueHashMaps;
private int[][] valueDistributions;
private int[] totalCount;
private NumericArray[] bins;
protected double[] targets;
protected Feature[] features;
private FeatureAnalyzer featureAnalyzer;
private int featureCount;
private int instanceCount;
private String textFile;
private String featuresStatFile;
private Timer timer;
protected String binFile;
protected BinaryFileWriter writer;
public BinaryFileGenerator(String textFile, String featuresStatFile, String binFile) {
this.textFile = textFile;
this.featuresStatFile = featuresStatFile;
this.binFile = binFile;
timer = new Timer();
}
protected void handle(SparseTextFileLine line) {
// Subclasses will override this function if needed
}
protected void loadValueHashMaps() {
timer.start();
System.out.print("Loading values...");
valueHashMaps = new ArrayList>(featureCount);
for (int f = 0; f < featureCount; f++) {
valueHashMaps.add(new HashMap());
}
SparseTextFileReader reader = new SparseTextFileReader();
reader.open(textFile);
SparseTextFileLine line = new SparseTextFileLine();
HashMap curMap;
int key;
instanceCount = 0;
while (reader.loadNextLine(line)) {
if (line.meta) {
continue;
}
for (int i = 0; i < line.numPairs; i++) {
FeatureValuePair pair = line.pairs[i];
curMap = valueHashMaps.get(pair.featureIndex - 1);
key = (int) pair.featureValue;
Integer count = curMap.get(key);
if (count == null) {
count = 0;
}
count++;
curMap.put(key, count);
}
handle(line);
instanceCount++;
}
reader.close();
System.out.println(" [Done in: " + timer.getElapsedSeconds() + " seconds.]");
}
private void makeDistributions() {
timer.start();
System.out.print("Making distributions...");
valueDistributions = new int[featureCount][];
totalCount = new int[featureCount];
List valueMap = new ArrayList();
HashMap curMap;
for (int f = 0; f < featureCount; f++) {
valueMap.clear();
curMap = valueHashMaps.get(f);
totalCount[f] = 0;
for (Entry entry : curMap.entrySet()) {
valueMap.add(entry.getKey());
totalCount[f] += entry.getValue();
}
if (!valueMap.contains(0)) {
valueMap.add(0);
}
Collections.sort(valueMap);
valueDistributions[f] = ArraysUtil.toArray(valueMap);
}
System.out.println(" [Done in: " + timer.getElapsedSeconds() + " seconds.]");
}
private void makeBins() throws Exception {
System.out.println("Making bins...");
timer.start();
bins = new NumericArray[featureCount];
for (int i = 0; i < featureCount; i++) {
int numValues = valueDistributions[i].length;
if (numValues == 1 && valueDistributions[i][0] == 0) {
bins[i] = NullNumericArray.getInstance();
} else if (numValues <= 2) {
bins[i] = new BitNumericArray(instanceCount);
} else if (numValues <= Byte.MAX_VALUE) {
bins[i] = new ByteNumericArray(instanceCount);
} else if (numValues <= Short.MAX_VALUE) {
bins[i] = new ShortNumericArray(instanceCount);
} else {
throw new Exception("One of your features have more than " + Short.MAX_VALUE
+ " distinct values. The support for this feature is not implemented yet.");
}
System.out.println("Feature: " + i + ", type: " + bins[i].getType().toString());
}
targets = new double[instanceCount];
int[] zeroCount = new int[featureCount];
SparseTextFileReader reader = new SparseTextFileReader();
reader.open(textFile);
SparseTextFileLine line = new SparseTextFileLine();
int instanceIdx = 0;
while (reader.loadNextLine(line)) {
if (line.meta) {
continue;
}
targets[instanceIdx] = line.target;
for (int i = 0; i < line.numPairs; i++) {
FeatureValuePair pair = line.pairs[i];
int fidx = pair.featureIndex - 1;
int index = Arrays.binarySearch(valueDistributions[fidx], (int) pair.featureValue);
bins[fidx].set(instanceIdx, index);
if (index == 0) {
zeroCount[fidx]++;
}
}
instanceIdx++;
}
reader.close();
System.out.println(" [Done in: " + timer.getElapsedSeconds() + " seconds.]");
}
private void makeFeatures() {
System.out.print("Making features...");
timer.start();
features = new Feature[featureCount];
for (int f = 0; f < featureCount; f++) {
features[f] = new Feature();
features[f].bins = bins[f];
features[f].upperBounds = valueDistributions[f];
features[f].setName(featureAnalyzer.getFeatureName(f + 1));
features[f].setMin(featureAnalyzer.min[f]);
features[f].setMax(featureAnalyzer.max[f]);
features[f].setFactor(featureAnalyzer.factor[f]);
features[f].setOnLogScale(featureAnalyzer.onLogScale[f]);
}
System.out.println(" [Done in: " + timer.getElapsedSeconds() + " seconds.]");
}
protected void createBinFile() {
writer = new BinaryFileWriter(binFile, features, targets);
}
private void writeBinFile() {
timer.start();
System.out.print("Creating bin file...");
writer.write();
writer.close();
System.out.println(" [Done in: " + timer.getElapsedSeconds() + " seconds.]");
}
public void convert() throws Exception {
if (new File(binFile).exists()) {
System.out.println("File: " + binFile + " already exists. Skipping it.");
return;
}
featureAnalyzer = new FeatureAnalyzer();
featureAnalyzer.loadFeaturesFromFile(featuresStatFile);
featureCount = featureAnalyzer.getFeatureCount();
loadValueHashMaps();
makeDistributions();
makeBins();
makeFeatures();
createBinFile();
writeBinFile();
}
/**
* SISTA added code: Converts the discrete features of a datum to Feature objects suitable for classification
* This functionality must be aligned with the convert() method, which works on an entire dataset
* @param discreteSparseFeatures
* @param featureAnalyzer
* @return
*/
public static Feature [] convert(FeatureValuePair [] discreteSparseFeatures, FeatureAnalyzer featureAnalyzer) {
int featureCount = featureAnalyzer.getFeatureCount();
List> valueHashMaps = mkValueHashMaps(featureCount, discreteSparseFeatures);
int[][] valueDistributions = mkDistributions(valueHashMaps, featureCount);
NumericArray[] bins = mkBins(valueDistributions, featureCount, discreteSparseFeatures);
Feature[] features = mkFeatures(valueDistributions, bins, featureCount);
return features;
}
/** SISTA added code; aligned with makeFeatures */
private static Feature[] mkFeatures(
int[][] valueDistributions,
NumericArray[] bins,
int featureCount) {
Feature[] features = new Feature[featureCount];
for (int f = 0; f < featureCount; f++) {
features[f] = new Feature();
features[f].bins = bins[f];
features[f].upperBounds = valueDistributions[f];
}
return features;
}
/** SISTA added code; aligned with makeBins */
private static NumericArray[] mkBins(
int[][] valueDistributions,
int featureCount,
FeatureValuePair [] discreteSparseFeatures) {
NumericArray[] bins = new NumericArray[featureCount];
for (int i = 0; i < featureCount; i++) {
int numValues = valueDistributions[i].length;
if (numValues == 1 && valueDistributions[i][0] == 0) {
bins[i] = NullNumericArray.getInstance();
} else if (numValues <= 2) {
bins[i] = new BitNumericArray(1);
} else {
throw new RuntimeException("One of your features have more than 2"
+ " distinct values. The support for this feature is not implemented yet.");
}
}
for(int i = 0; i < discreteSparseFeatures.length; i ++) {
int fidx = discreteSparseFeatures[i].featureIndex;
double value = discreteSparseFeatures[i].featureValue;
int index = Arrays.binarySearch(valueDistributions[fidx], (int) value);
bins[fidx].set(0, index);
}
return bins;
}
/** SISTA added code; aligned with loadValueHashMaps */
private static List> mkValueHashMaps(
int featureCount,
FeatureValuePair [] discreteSparseFeatures) {
List> valueHashMaps =
new ArrayList>(featureCount);
for (int f = 0; f < featureCount; f++) {
valueHashMaps.add(new HashMap());
}
for(int i = 0; i < discreteSparseFeatures.length; i ++) {
int idx = discreteSparseFeatures[i].featureIndex;
int key = (int) discreteSparseFeatures[i].featureValue;
HashMap curMap = valueHashMaps.get(idx);
Integer count = curMap.get(key);
if (count == null) {
count = 0;
}
count++;
curMap.put(key, count);
}
return valueHashMaps;
}
/** SISTA added code; aligned with makeDistributions */
private static int[][] mkDistributions(
List> valueHashMaps,
int featureCount) {
int [][] valueDistributions = new int[featureCount][];
List valueMap = new ArrayList();
HashMap curMap;
for (int f = 0; f < featureCount; f++) {
valueMap.clear();
curMap = valueHashMaps.get(f);
for (Entry entry : curMap.entrySet()) {
valueMap.add(entry.getKey());
}
if (!valueMap.contains(0)) {
valueMap.add(0);
}
Collections.sort(valueMap);
valueDistributions[f] = ArraysUtil.toArray(valueMap);
}
return valueDistributions;
}
}