All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.fbk.twm.classifier.OneExamplePerSenseExtractor Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (2014) Fondazione Bruno Kessler (http://www.fbk.eu/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package eu.fbk.twm.classifier;

import org.apache.log4j.Logger;
import eu.fbk.twm.utils.StringTable;

import java.io.*;
import java.text.DecimalFormat;
import java.util.*;
import java.util.concurrent.*;
import java.util.regex.Pattern;

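/**
 * Base class for extractors that read a tab-separated, UTF-8 encoded file,
 * group consecutive rows that share the same form and hand each group,
 * organized by page, to {@code buildExamples} on a pool of worker threads.
 *
 * Author: giuliano
 * Created: 2/5/13, 2:23 PM
 */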
public abstract class OneExamplePerSenseExtractor {
	/**
	 * Define a static logger variable so that it references the
	 * Logger instance named OneExamplePerSenseExtractor.
	 */
	static Logger logger = Logger.getLogger(OneExamplePerSenseExtractor.class.getName());

	protected int numForms;

	protected int numThreads;

	private ExecutorService myExecutor;

	public final static int DEFAULT_THREADS_NUMBER = 1;

	public final static int DEFAULT_NOTIFICATION_POINT = 100000;

	public static final int PAGE_COLUMN_INDEX = 1;

	public static final int DEFAULT_MINIMUM_FORM_FREQ = 1;

	public static final int DEFAULT_MINIMUM_PAGE_FREQ = 1;

	public static final boolean DEFAULT_NORMALIZE = false;

	protected int notificationPoint;

	PrintWriter senseWriter;

	public static final int DEFAULT_NUM_FORMS = Integer.MAX_VALUE;

	protected static Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);

	protected static Pattern spacePattern = Pattern.compile(StringTable.SPACE);

	protected static DecimalFormat df = new DecimalFormat("###,###,###,###");

	public final static int DEFAULT_QUEUE_SIZE = 10000;

	private int minimumFormFreq;

	private int minimumPageFreq;

	protected boolean normalized;

	protected int tfType;

	protected OneExamplePerSenseExtractor(String outputFileName, int numThreads) throws IOException {
		this(new File(outputFileName), numThreads);
	}

	public OneExamplePerSenseExtractor(File outputFile, int numThreads) throws IOException {
		this.numThreads = numThreads;
		normalized = DEFAULT_NORMALIZE;
		minimumFormFreq = DEFAULT_MINIMUM_FORM_FREQ;
		minimumPageFreq = DEFAULT_MINIMUM_PAGE_FREQ;
		notificationPoint = DEFAULT_NOTIFICATION_POINT;
		logger.info("creating the thread executor (" + numThreads + ")");
		int blockQueueSize = DEFAULT_QUEUE_SIZE;
		BlockingQueue blockingQueue = new ArrayBlockingQueue(blockQueueSize);
		RejectedExecutionHandler rejectedExecutionHandler = new ThreadPoolExecutor.CallerRunsPolicy();
		myExecutor = new ThreadPoolExecutor(numThreads, numThreads, 1, TimeUnit.MINUTES, blockingQueue, rejectedExecutionHandler);
		senseWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8")));
	}

	public int getTfType() {
		return tfType;
	}

	public void setTfType(int tfType) {
		this.tfType = tfType;
	}

	public void setNormalized(boolean normalized) {
		this.normalized = normalized;
	}

	public boolean isNormalized() {
		return normalized;
	}

	public int getMinimumFormFreq() {
		return minimumFormFreq;
	}

	public void setMinimumFormFreq(int minimumFormFreq) {
		this.minimumFormFreq = minimumFormFreq;
	}

	public int getMinimumPageFreq() {
		return minimumPageFreq;
	}

	public void setMinimumPageFreq(int minimumPageFreq) {
		this.minimumPageFreq = minimumPageFreq;
	}

	public int getNumForms() {
		return numForms;
	}

	public void setNumForms(int numForms) {
		this.numForms = numForms;
	}

	public int getNumThreads() {
		return numThreads;
	}

	public void setNumThreads(int numThreads) {
		this.numThreads = numThreads;
	}

	public int getNotificationPoint() {
		return notificationPoint;
	}

	public void setNotificationPoint(int notificationPoint) {
		this.notificationPoint = notificationPoint;
	}

	public void extract(String name) throws IOException {
		extract(new File(name));
	}

	public void extract(File in) throws IOException {
		logger.info("reading " + in + "...");

		long begin = System.currentTimeMillis(), end = 0;
		LineNumberReader lnr = new LineNumberReader(new InputStreamReader(new FileInputStream(in), "UTF-8"));
		String line;
		int count = 0, part = 0, tot = 0;
		String previousForm = "";
		//Map map = new HashMap();
		String[] t = null;
		List list = new ArrayList();
		logger.info("totalFreq\tcount\ttime\tdate");
		// read the first line
		if ((line = lnr.readLine()) != null) {
			try {
				t = tabPattern.split(line);

				if (t.length == eu.fbk.twm.index.csv.OneExamplePerSenseExtractor.COLUMN_NUMBER) {
					list.add(t);
					previousForm = t[eu.fbk.twm.index.csv.OneExamplePerSenseExtractor.FORM_INDEX];
					//logger.info(part + "\t\"" + t[3] + "\"");
					part++;
				}
			} catch (Exception e) {
				logger.error("Error at line " + count);
				logger.error(e);
			} finally {
				tot++;
			}
		}

		// read the rest of the file
		while ((line = lnr.readLine()) != null) {

			if (count > numForms) {
				logger.info("Exit after " + count + " forms (" + numForms + ")");
				break;
			}
			try {
				t = tabPattern.split(line);
				if (t.length == eu.fbk.twm.index.csv.OneExamplePerSenseExtractor.COLUMN_NUMBER) {
					if (!t[eu.fbk.twm.index.csv.OneExamplePerSenseExtractor.FORM_INDEX].equals(previousForm)) {
						//logger.debug("executing " + previousForm + " (" + list.size() + ")...");
						//todo: filter forms with less than minimumFormFreq
						//todo: add topic label
						myExecutor.execute(new ExampleBuilder(list, previousForm));
						list = new ArrayList();
						count++;
						part = 0;
					}
					list.add(t);
					previousForm = t[eu.fbk.twm.index.csv.OneExamplePerSenseExtractor.FORM_INDEX];
					part++;
				}
			} catch (Exception e) {
				logger.error("Error at line " + tot);
				logger.error(e);
			} finally {
				tot++;
			}
			//if (count > 500) break;

			if ((tot % notificationPoint) == 0) {
				//senseWriter.flush();
				end = System.currentTimeMillis();
				logger.info(df.format(tot) + "\t" + df.format(count) + "\t" + df.format(end - begin) + "\t" + new Date());
				begin = System.currentTimeMillis();
			}
		} // end while
		lnr.close();

		// add the last line
		list.add(t);
		logger.debug("executing " + previousForm + " (" + list.size() + ")...");
		myExecutor.execute(new ExampleBuilder(list, previousForm));

		end = System.currentTimeMillis();
		logger.info(df.format(tot) + "\t" + df.format(count) + "\t" + df.format(end - begin) + "\t" + new Date());

		try {
			myExecutor.shutdown();
			logger.info("waiting for execution...");
			myExecutor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
		} catch (InterruptedException e) {
			logger.error(e);
		}
		logger.info("closing the streams...");

		senseWriter.close();
		end();
		logger.info("done it");
	}

	public abstract void buildExamples(Map> senseMap, String form);


	public abstract void end();

	/**
	 * This class is a factory for building the examples
	 */
	public class ExampleBuilder implements Runnable {
		private List list;
		private String form;

		public ExampleBuilder(List list, String form) {
			this.list = list;
			this.form = form;
		}

		public void run() {
			buildExamples(createSenseListMap(list), form);
		}
	}

	/**
	 * Returns a map in which the keys are senses and values are senseList of examples.
	 *
	 * @return a map in which the keys are senses and values are senseList of examples.
	 */
	Map> createSenseListMap(List senseList) {
		Map> map = new HashMap>();
		String[] line;
		String key;
		for (int i = 0; i < senseList.size(); i++) {
			line = senseList.get(i);
			key = line[eu.fbk.twm.index.csv.OneExamplePerSenseExtractor.PAGE_INDEX];
			List list = map.get(key);
			if (list == null) {
				list = new ArrayList();
				map.put(key, list);
			}
			list.add(line);
		}
		return map;
	}
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy