// Source listing: org.apache.ctakes.ytex.kernel.BaseSparseDataFormatter (Maven / Gradle / Ivy)
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.ytex.kernel;
import com.google.common.collect.BiMap;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;
public abstract class BaseSparseDataFormatter implements SparseDataFormatter {
protected KernelUtil kernelUtil;
/**
* directory to export files to, with trailing separator added on if
* necessary
*/
protected String outdir = null;
/**
* map of numeric attribute - attribute index.
*/
protected Map numericAttributeMap = new HashMap();
/**
* map of nominal attribute - [nominal attribute value - attribute index].
*/
protected Map> nominalAttributeMap = new HashMap>();
/**
* map of label - [class name - class index]
*/
protected Map> labelToClassIndexMap = new HashMap>();
/**
* 1-based attribute index
*/
protected int maxAttributeIndex = 0;
/**
* export properties - properties file that controls what to do for this
* export
*/
protected Properties exportProperties;
public BaseSparseDataFormatter(KernelUtil kernelUtil) {
this.kernelUtil = kernelUtil;
}
protected void exportAttributeNames(SparseData sparseData, String label,
Integer run, Integer fold) throws IOException {
// reset attribute name/index state
this.nominalAttributeMap.clear();
this.numericAttributeMap.clear();
this.maxAttributeIndex = 0;
// construct file name
String filename = FileUtil.getScopedFileName(outdir, label, run, fold,
"attributes.txt");
BufferedWriter w = null;
try {
w = new BufferedWriter(new FileWriter(filename));
// write attributes
exportAttributeNames(w, sparseData);
} finally {
if (w != null)
w.close();
}
}
/**
* assign indices to each attribute.
*
* @param outdir
* directory to write file to
* @param sparseData
* @param numericAttributeMap
* @param nominalAttributeMap
* for nominal indices, create an index for each value.
* @throws IOException
*/
protected int exportAttributeNames(BufferedWriter w, SparseData sparseData)
throws IOException {
// add numeric indices
for (String attributeName : sparseData.getNumericWords()) {
addNumericAttribute(w, attributeName);
}
// add nominal indices
for (SortedMap.Entry> nominalAttribute : sparseData
.getNominalWordValueMap().entrySet()) {
Map attrValueIndexMap = new HashMap(
nominalAttribute.getValue().size());
for (String attrValue : nominalAttribute.getValue()) {
w.write(nominalAttribute.getKey());
if (nominalAttribute.getValue().size() > 1) {
w.write("\t");
w.write(attrValue);
}
w.write("\n");
attrValueIndexMap.put(attrValue, ++maxAttributeIndex);
}
nominalAttributeMap.put(nominalAttribute.getKey(),
attrValueIndexMap);
}
return maxAttributeIndex;
}
protected void addNumericAttribute(BufferedWriter w, String attributeName)
throws IOException {
w.write(attributeName);
w.write("\n");
numericAttributeMap.put(attributeName, ++maxAttributeIndex);
}
/**
* create a map of attribute index - attribute value for the given instance.
*
* @param bagOfWordsData
* @param numericAttributeMap
* @param nominalAttributeMap
* @param instanceId
* @return
*/
protected SortedMap getSparseLineValues(
SparseData bagOfWordsData,
Map numericAttributeMap,
Map> nominalAttributeMap,
long instanceId) {
SortedMap instanceValues = new TreeMap();
// get numeric values for instance
if (bagOfWordsData.getInstanceNumericWords().containsKey(instanceId)) {
for (Map.Entry numericValue : bagOfWordsData
.getInstanceNumericWords().get(instanceId).entrySet()) {
// look up index for attribute and put in map
instanceValues.put(
numericAttributeMap.get(numericValue.getKey()),
numericValue.getValue());
}
}
if (bagOfWordsData.getInstanceNominalWords().containsKey(instanceId)) {
for (Map.Entry nominalValue : bagOfWordsData
.getInstanceNominalWords().get(instanceId).entrySet()) {
// look up index for attribute and value and put in map
instanceValues.put(
nominalAttributeMap.get(nominalValue.getKey()).get(
nominalValue.getValue()), 1d);
}
}
return instanceValues;
}
protected void exportSparseRow(SparseData bagOfWordsData, long instanceId,
BufferedWriter wData, int row) throws IOException {
SortedMap instanceValues = getSparseLineValues(
bagOfWordsData, numericAttributeMap, nominalAttributeMap,
instanceId);
// write attributes
// add the attributes
for (SortedMap.Entry instanceValue : instanceValues
.entrySet()) {
// row = instance number
wData.write(Integer.toString(row));
wData.write("\t");
// column = attribute index
wData.write(Integer.toString(instanceValue.getKey()));
wData.write("\t");
// value = value
// TODO fix me!
// instance id formatted as double
if (instanceValue.getKey() == 1) {
wData.write(Long.toString(instanceValue.getValue().longValue()));
} else {
wData.write(Double.toString(instanceValue.getValue()));
}
wData.write("\n");
}
}
/**
* export sparse matrix data for use in matlab/R. creates _data.txt with
* following columns:
*
* - row (int)
*
- column (int)
*
- cell value (double)
*
* also exports instance data (instance.txt). By default tab delimited
* without header. This can be read as a normal 3-column matrix into
* matlab/R, and then converted into a sparse matrix using
* Matrix::sparseMatrix (R) or sparse (matlab).
*/
protected void exportSparseMatrix(String filename, SparseData sparseData)
throws IOException {
BufferedWriter wData = null;
try {
wData = new BufferedWriter(new FileWriter(filename));
int row = 1;
for (long instanceId : sparseData.getInstanceIds()) {
exportSparseRow(sparseData, instanceId, wData, row);
row++;
}
} finally {
if (wData != null)
wData.close();
}
}
// protected List getInstanceIdsForScope(InstanceData
// instanceLabel,
// String label, Integer run, Integer fold) {
// List instanceIds = new ArrayList();
// SortedSet sortedInstanceIds = new TreeSet();
// if (label == null || label.length() == 0) {
// // add all instance ids
// for (SortedMap>>> runMap : instanceLabel.labelToInstanceMap
// .values()) {
// for (SortedMap>>
// foldMap : runMap
// .values()) {
// for (SortedMap> trainTestFold : foldMap
// .values()) {
// for (SortedMap trainMap : trainTestFold
// .values())
// sortedInstanceIds.addAll(trainMap.keySet());
// }
// }
// }
// } else if (label != null && label.length() > 0 && run == null) {
// // label scope
// }
// return instanceIds;
// }
/**
* get needed properties out of outdir. convert class names into integers
* for libsvm. attempt to parse the class name into an integer. if this
* fails, use an index that we increment. index corresponds to class name's
* alphabetical order.
*/
@Override
public void initializeExport(InstanceData instanceLabel,
Properties properties, SparseData sparseData) throws IOException {
this.exportProperties = properties;
this.outdir = properties.getProperty("outdir");
FileUtil.createOutdir(outdir);
kernelUtil.fillLabelToClassToIndexMap(
instanceLabel.getLabelToClassMap(), this.labelToClassIndexMap);
}
/**
* add the 'unlabeled' class id to the classIndexMap if it isn't there
* already
*/
protected void updateLabelClassMapTransductive() {
for (Map classIndexMap : labelToClassIndexMap.values()) {
classIndexMap.put("0", 0);
}
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy