org.itc.irst.tcc.sre.CreateTrainingSet Maven / Gradle / Ivy
/*
* Copyright 2005 FBK-irst (http://www.fbk.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.itc.irst.tcc.sre;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.util.Iterator;
import java.util.Properties;
import org.itc.irst.tcc.sre.data.ArgumentSet;
import org.itc.irst.tcc.sre.data.ExampleSet;
import org.itc.irst.tcc.sre.data.SentenceSetCopy;
import org.itc.irst.tcc.sre.kernel.expl.AbstractMapping;
import org.itc.irst.tcc.sre.kernel.expl.ContextMappingFactory;
import org.itc.irst.tcc.sre.util.FeatureIndex;
import org.itc.irst.tcc.sre.util.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
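/**
* Creates a training set for relation extraction: reads the examples,
* embeds them into the feature space of the configured kernel and saves
* the embedded set together with the feature indexes, the parameters
* and the command line in the output directory.
*
* @author Claudio Giuliano
* @version %I%, %G%
* @since 1.0
*/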
public class CreateTrainingSet
{
/**
* Static logger shared by all instances; it references the Logger
* instance named after the <code>CreateTrainingSet</code> class
* (its fully qualified class name).
*/
static Logger logger = LoggerFactory.getLogger(CreateTrainingSet.class.getName());
//
/**
* Constant with value 20. NOTE(review): not referenced in the visible part
* of this file -- presumably the maximum number of relation classes
* supported (e.g. to size per-class arrays); confirm against its users.
*/
public static final int MAX_NUMBER_OF_CLASSES = 20;
//
// Settings of this run, stored by the constructor. The visible code reads the
// keys "output-dir", "example-file", "kernel-type" and "svm-cost" from it and
// hands the whole object to the context mapping via setParameters.
private Properties parameter;
//
/**
* Creates an instance configured by the given properties.
*
* @param parameter the settings consulted later by <code>run()</code>
* (for example <code>output-dir</code>, <code>example-file</code> and
* <code>kernel-type</code>); the reference is stored as is, no copy is made
*/
public CreateTrainingSet(Properties parameter)
{
this.parameter = parameter;
} // end constructor
//
/**
 * Builds the training set for relation extraction and stores every
 * resulting artifact in the output directory.
 * <p>
 * The steps are: read the examples from <code>example-file</code>,
 * compute the class frequencies and class weights, initialise the
 * argument types, embed the examples with the mapping selected by
 * <code>kernel-type</code>, determine the SVM cost parameter C and
 * finally save the embedded set, the feature indexes, the parameters
 * and the command line.
 * <p>
 * The output directory is created, together with any missing parent
 * directories, when it does not exist yet; an already existing
 * directory is reused. If the directory can neither be created nor
 * found, the error is logged and the JVM is terminated with exit
 * status -1.
 * NOTE(review): the save helpers are not visible here -- presumably they
 * replace files of the same name in a reused directory; confirm.
 *
 * @throws IllegalArgumentException if the <code>output-dir</code> or the
 *         <code>example-file</code> property is not set
 * @throws Exception any failure raised by the steps above is propagated
 *         unchanged
 */
public void run() throws Exception
{
    logger.info("create a train set for relation extraction");

    // validate both mandatory settings up front, so nothing is created
    // on disk when the configuration is incomplete
    String outputDir = parameter.getProperty("output-dir");
    if (outputDir == null)
    {
        throw new IllegalArgumentException("missing required property: output-dir");
    }
    String exampleFile = parameter.getProperty("example-file");
    if (exampleFile == null)
    {
        throw new IllegalArgumentException("missing required property: example-file");
    }

    // the save helpers receive the directory as a prefix, so it must end
    // with the separator
    if (!outputDir.endsWith(File.separator))
    {
        outputDir += File.separator;
    }

    // mkdirs() also returns false when the directory is already there;
    // only give up when it really does not exist afterwards
    File outputDirectory = new File(outputDir);
    if (!outputDirectory.mkdirs() && !outputDirectory.isDirectory())
    {
        logger.error("cannot create dir {}", outputDir);
        System.exit(-1);
    }

    // read data set
    File inputFile = new File(exampleFile);
    ExampleSet inputSet = readDataSet(inputFile);
    logger.info("input training set size: {}", inputSet.size());

    // get the class freq
    int[] freq = classFreq(inputSet);

    // calculate the class weight
    double[] weight = classWeigth(freq);

    // find argument types
    ArgumentSet.getInstance().init(inputSet);

    // create the mapping factory
    ContextMappingFactory contextMappingFactory = ContextMappingFactory.getContextMappingFactory();
    AbstractMapping contextMapping = contextMappingFactory.getInstance(parameter.getProperty("kernel-type"));

    // set the command line parameters
    contextMapping.setParameters(parameter);

    // get the number of subspaces
    int subspaceCount = contextMapping.subspaceCount();
    logger.debug("number of subspaces: {}", subspaceCount);

    // create the index
    FeatureIndex[] index = createFeatureIndex(subspaceCount);

    // embed the input data into a feature space
    logger.info("embed the training set");
    ExampleSet outputSet = contextMapping.map(inputSet, index);
    logger.debug("embedded training set size: {}", outputSet.size());

    // if not specified, calculate SVM parameter C
    double c = calculateC(outputSet);
    logger.info("cost parameter C = {}", c);

    // save the training set (the returned file handle is not needed here)
    saveExampleSet(outputSet, outputDir);

    // save the indexes
    saveFeatureIndexes(index, outputDir);

    // save param
    saveParameters(outputDir);

    // save command line
    saveCommandLine(outputDir, c, weight);
} // end run
// calculate parameter C of SVM
//
// To allow some flexibility in separating the categories,
// SVM models have a cost parameter, C, that controls the
// trade off between allowing training errors and forcing
// rigid margins. It creates a soft margin that permits
// some misclassifications. Increasing the value of C
// increases the cost of misclassifying points and forces
// the creation of a more accurate model that may not
// generalize well
private double calculateC(ExampleSet data) //throws Exception
{
String svmCost = parameter.getProperty("svm-cost");
if (svmCost != null)
return Integer.parseInt(svmCost);
logger.info("calculate default SVM cost parameter C");
//double c = 1;
double avr = 0;
// the example set is normalized
// all vectors have the same norm
for (int i=0;i file with training data (SRE format)\n");
sb.append("\toutput-dir\t-> directory in which to store resulting files\n");
sb.append("Options:\n");
sb.append("\t-h\t\t-> this help\n");
sb.append("\t-k string\t-> set type of kernel function (default SL):\n");
sb.append("\t\t\t\tLC: Local Context Kernel\n");
sb.append("\t\t\t\tGC: Global Context Kernel\n");
sb.append("\t\t\t\tSL: Shallow Linguistic Context Kernel\n");
sb.append("\t-n [1..]\t-> set the parameter n-gram of kernels SL and GC (default 3)\n");
sb.append("\t-w [0..]\t-> set the window size of kernel LC (default 2)\n");
//sb.append("\t-c [0..]\t-> set the trade-off between training error and margin (default 1/[avg. x*x'])\n");
sb.append("\t-f\t-> fraction of training set (default 1)\n");
sb.append("\t-m int\t\t-> set cache memory size in MB (default 128)\n");
return sb.toString();
} // end getHelp
// //
// public static void main(String args[]) throws Exception
// {
// String logConfig = System.getProperty("log-config");
// if (logConfig == null)
// logConfig = "log-config.txt";
//
// PropertyConfigurator.configure(logConfig);
//
// Properties parameter = new Properties();
// parameter.setProperty("cache-size", "128");
// parameter.setProperty("kernel-type", "SL");
// parameter.setProperty("n-gram", "3");
// parameter.setProperty("window-size", "2");
// //parameter.setProperty("use-tf", "false");
// //parameter.setProperty("stemmer-type", "null");
// //parameter.setProperty("svm-cost", "-1");
//
// if (args.length < 2)
// {
// System.err.println(getHelp());
// System.exit(-1);
// }
//
// parameter.setProperty("example-file", args[args.length - 2]);
// parameter.setProperty("output-dir", args[args.length - 1]);
//
//
// // set parameters
// for (int i=0;i
© 2015 - 2025 Weber Informatics LLC | Privacy Policy