All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.itc.irst.tcc.sre.CreateTrainingSet Maven / Gradle / Ivy

/*
 * Copyright 2005 FBK-irst (http://www.fbk.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.itc.irst.tcc.sre;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.util.Iterator;
import java.util.Properties;

import org.itc.irst.tcc.sre.data.ArgumentSet;
import org.itc.irst.tcc.sre.data.ExampleSet;
import org.itc.irst.tcc.sre.data.SentenceSetCopy;
import org.itc.irst.tcc.sre.kernel.expl.AbstractMapping;
import org.itc.irst.tcc.sre.kernel.expl.ContextMappingFactory;
import org.itc.irst.tcc.sre.util.FeatureIndex;
import org.itc.irst.tcc.sre.util.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;


/**
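 * Creates a training set for relation extraction: reads an example file,
 * embeds the examples into a feature space through a context mapping selected
 * by kernel type, and saves the embedded set, the feature indexes, the
 * parameters and the command line into an output directory.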
 *
 * @author 	Claudio Giuliano
 * @version %I%, %G%
 * @since		1.0
 */
public class CreateTrainingSet
{
	/**
	 * Define a static logger variable so that it references the
	 * Logger instance named after the fully qualified name of
	 * CreateTrainingSet.
	 */
	static Logger logger = LoggerFactory.getLogger(CreateTrainingSet.class.getName());

	// upper bound on the number of classes
	// NOTE(review): not referenced in the visible part of this file -
	// confirm where (and whether) this limit is enforced
	public static final int MAX_NUMBER_OF_CLASSES = 20;

	// configuration supplied to the constructor; run() reads the keys
	// "output-dir", "example-file" and "kernel-type" from it and passes
	// the whole object to the context mapping, calculateC() reads "svm-cost"
	private Properties parameter;

	//
	/**
	 * Creates a training-set builder driven by the given configuration.
	 *
	 * @param parameter properties later read by {@link #run()}; the reference
	 *                  is stored as-is, no copy is made
	 */
	public CreateTrainingSet(Properties parameter)
	{
		// keep the caller's object; nothing is validated at construction time
		this.parameter = parameter;
	}

	//
	/**
	 * Builds the training set and writes every resulting artifact into the
	 * output directory.
	 *
	 * <p>Steps, in order: resolve/create the directory named by the
	 * <code>output-dir</code> property, read the examples from the file named
	 * by <code>example-file</code>, compute class frequencies and weights,
	 * initialise the {@link ArgumentSet} singleton from the input, obtain the
	 * context mapping for <code>kernel-type</code>, embed the input set, obtain
	 * the SVM cost C, then save the embedded set, the feature indexes, the
	 * parameters and the command line.
	 *
	 * <p>An already existing output directory is accepted, and missing parent
	 * directories are created. If the directory cannot be created the error is
	 * logged and the JVM exits with status -1.
	 *
	 * @throws IllegalArgumentException if the <code>output-dir</code> or
	 *         <code>example-file</code> property is not set
	 * @throws Exception propagated from the reading, mapping and saving steps
	 */
	public void run() throws Exception
	{
		logger.info("create a train set for relation extraction");

		// fail with a clear message instead of a bare NullPointerException
		String outputDir = parameter.getProperty("output-dir");
		if (outputDir == null)
			throw new IllegalArgumentException("missing required property: output-dir");

		// the save helpers receive the directory as a string prefix,
		// so make sure it ends with the separator
		if (!outputDir.endsWith(File.separator))
			outputDir += File.separator;

		// mkdirs() returns false when the directory already exists, hence the
		// isDirectory() re-check: only a real failure aborts the run
		File outputDirFile = new File(outputDir);
		if (!outputDirFile.mkdirs() && !outputDirFile.isDirectory())
		{
			logger.error("cannot create dir " + outputDir);
			System.exit(-1);
		}

		// read data set
		String exampleFile = parameter.getProperty("example-file");
		if (exampleFile == null)
			throw new IllegalArgumentException("missing required property: example-file");

		File inputFile = new File(exampleFile);
		ExampleSet inputSet = readDataSet(inputFile);
		logger.info("input training set size: " + inputSet.size());

		// get the class freq
		int[] freq = classFreq(inputSet);

		// calculate the class weight
		double[] weight = classWeigth(freq);

		// find argument types
		ArgumentSet.getInstance().init(inputSet);

		// create the mapping selected by the kernel type
		ContextMappingFactory contextMappingFactory = ContextMappingFactory.getContextMappingFactory();
		AbstractMapping contextMapping = contextMappingFactory.getInstance(parameter.getProperty("kernel-type"));
		// set the command line parameters
		contextMapping.setParameters(parameter);

		// get the number of subspaces
		int subspaceCount = contextMapping.subspaceCount();
		logger.debug("number of subspaces: " + subspaceCount);

		// create the index (sized by the number of subspaces)
		FeatureIndex[] index = createFeatureIndex(subspaceCount);

		// embed the input data into a feature space
		logger.info("embed the training set");
		ExampleSet outputSet = contextMapping.map(inputSet, index);
		logger.debug("embedded training set size: " + outputSet.size());

		// if not specified, calculate SVM parameter C
		double c = calculateC(outputSet);
		logger.info("cost parameter C = " + c);

		// save the training set (the returned File is not needed here)
		saveExampleSet(outputSet, outputDir);

		// save the indexes
		saveFeatureIndexes(index, outputDir);

		// save param
		saveParameters(outputDir);

		// save command line
		saveCommandLine(outputDir, c, weight);
	} // end run

	// calculate parameter C of SVM
	//
	// To allow some flexibility in separating the categories,
	// SVM models have a cost parameter, C, that controls the
	// trade-off between allowing training errors and forcing
	// rigid margins. It creates a soft margin that permits
	// some misclassifications. Increasing the value of C
	// increases the cost of misclassifying points and forces
	// the creation of a more accurate model that may not
	// generalize well.
	private double calculateC(ExampleSet data) //throws Exception
	{
		String svmCost = parameter.getProperty("svm-cost");
		if (svmCost != null)
			return Integer.parseInt(svmCost);

		logger.info("calculate default SVM cost parameter C");

		//double c = 1;
		double avr = 0;

		// the example set is normalized
		// all vectors have the same norm
		for (int i=0;i file with training data (SRE format)\n");
		sb.append("\toutput-dir\t-> directory in which to store resulting files\n");

		sb.append("Options:\n");
		sb.append("\t-h\t\t-> this help\n");
		sb.append("\t-k string\t-> set type of kernel function (default SL):\n");
		sb.append("\t\t\t\tLC: Local Context Kernel\n");
		sb.append("\t\t\t\tGC: Global Context Kernel\n");
		sb.append("\t\t\t\tSL: Shallow Linguistic Context Kernel\n");

		sb.append("\t-n [1..]\t-> set the parameter n-gram of kernels SL and GC  (default 3)\n");
		sb.append("\t-w [0..]\t-> set the window size of kernel LC (default 2)\n");
		//sb.append("\t-c [0..]\t-> set the trade-off between training error and margin (default 1/[avg. x*x'])\n");

		sb.append("\t-f\t-> fraction of training set (default 1)\n");
		sb.append("\t-m int\t\t-> set cache memory size in MB (default 128)\n");

		return sb.toString();
	} // end getHelp

//	//
//	public static void main(String args[]) throws Exception
//	{
//		String logConfig = System.getProperty("log-config");
//		if (logConfig == null)
//			logConfig = "log-config.txt";
//
//		PropertyConfigurator.configure(logConfig);
//
//		Properties parameter = new Properties();
//		parameter.setProperty("cache-size", "128");
//		parameter.setProperty("kernel-type", "SL");
//		parameter.setProperty("n-gram", "3");
//		parameter.setProperty("window-size", "2");
//		//parameter.setProperty("use-tf", "false");
//		//parameter.setProperty("stemmer-type", "null");
//		//parameter.setProperty("svm-cost", "-1");
//
//		if (args.length < 2)
//		{
//			System.err.println(getHelp());
//			System.exit(-1);
//		}
//
//		parameter.setProperty("example-file", args[args.length - 2]);
//		parameter.setProperty("output-dir", args[args.length - 1]);
//
//
//		// set parameters
//		for (int i=0;i




© 2015 - 2025 Weber Informatics LLC | Privacy Policy