// eu.fbk.twm.classifier.OneExamplePerSenseExtractor Maven / Gradle / Ivy
// The newest version!
/*
* Copyright (2014) Fondazione Bruno Kessler (http://www.fbk.eu/)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package eu.fbk.twm.classifier;
import org.apache.log4j.Logger;
import eu.fbk.twm.utils.StringTable;
import java.io.*;
import java.text.DecimalFormat;
import java.util.*;
import java.util.concurrent.*;
import java.util.regex.Pattern;
/**
* Created with IntelliJ IDEA.
* User: giuliano
* Date: 2/5/13
* Time: 2:23 PM
* To change this template use File | Settings | File Templates.
*/
public abstract class OneExamplePerSenseExtractor {
/**
* Define a static logger variable so that it references the
* Logger instance named OneExamplePerSenseExtractor
.
*/
static Logger logger = Logger.getLogger(OneExamplePerSenseExtractor.class.getName());
protected int numForms;
protected int numThreads;
private ExecutorService myExecutor;
public final static int DEFAULT_THREADS_NUMBER = 1;
public final static int DEFAULT_NOTIFICATION_POINT = 100000;
public static final int PAGE_COLUMN_INDEX = 1;
public static final int DEFAULT_MINIMUM_FORM_FREQ = 1;
public static final int DEFAULT_MINIMUM_PAGE_FREQ = 1;
public static final boolean DEFAULT_NORMALIZE = false;
protected int notificationPoint;
PrintWriter senseWriter;
public static final int DEFAULT_NUM_FORMS = Integer.MAX_VALUE;
protected static Pattern tabPattern = Pattern.compile(StringTable.HORIZONTAL_TABULATION);
protected static Pattern spacePattern = Pattern.compile(StringTable.SPACE);
protected static DecimalFormat df = new DecimalFormat("###,###,###,###");
public final static int DEFAULT_QUEUE_SIZE = 10000;
private int minimumFormFreq;
private int minimumPageFreq;
protected boolean normalized;
protected int tfType;
protected OneExamplePerSenseExtractor(String outputFileName, int numThreads) throws IOException {
this(new File(outputFileName), numThreads);
}
public OneExamplePerSenseExtractor(File outputFile, int numThreads) throws IOException {
this.numThreads = numThreads;
normalized = DEFAULT_NORMALIZE;
minimumFormFreq = DEFAULT_MINIMUM_FORM_FREQ;
minimumPageFreq = DEFAULT_MINIMUM_PAGE_FREQ;
notificationPoint = DEFAULT_NOTIFICATION_POINT;
logger.info("creating the thread executor (" + numThreads + ")");
int blockQueueSize = DEFAULT_QUEUE_SIZE;
BlockingQueue blockingQueue = new ArrayBlockingQueue(blockQueueSize);
RejectedExecutionHandler rejectedExecutionHandler = new ThreadPoolExecutor.CallerRunsPolicy();
myExecutor = new ThreadPoolExecutor(numThreads, numThreads, 1, TimeUnit.MINUTES, blockingQueue, rejectedExecutionHandler);
senseWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8")));
}
public int getTfType() {
return tfType;
}
public void setTfType(int tfType) {
this.tfType = tfType;
}
public void setNormalized(boolean normalized) {
this.normalized = normalized;
}
public boolean isNormalized() {
return normalized;
}
public int getMinimumFormFreq() {
return minimumFormFreq;
}
public void setMinimumFormFreq(int minimumFormFreq) {
this.minimumFormFreq = minimumFormFreq;
}
public int getMinimumPageFreq() {
return minimumPageFreq;
}
public void setMinimumPageFreq(int minimumPageFreq) {
this.minimumPageFreq = minimumPageFreq;
}
public int getNumForms() {
return numForms;
}
public void setNumForms(int numForms) {
this.numForms = numForms;
}
public int getNumThreads() {
return numThreads;
}
public void setNumThreads(int numThreads) {
this.numThreads = numThreads;
}
public int getNotificationPoint() {
return notificationPoint;
}
public void setNotificationPoint(int notificationPoint) {
this.notificationPoint = notificationPoint;
}
public void extract(String name) throws IOException {
extract(new File(name));
}
public void extract(File in) throws IOException {
logger.info("reading " + in + "...");
long begin = System.currentTimeMillis(), end = 0;
LineNumberReader lnr = new LineNumberReader(new InputStreamReader(new FileInputStream(in), "UTF-8"));
String line;
int count = 0, part = 0, tot = 0;
String previousForm = "";
//Map map = new HashMap();
String[] t = null;
List list = new ArrayList();
logger.info("totalFreq\tcount\ttime\tdate");
// read the first line
if ((line = lnr.readLine()) != null) {
try {
t = tabPattern.split(line);
if (t.length == eu.fbk.twm.index.csv.OneExamplePerSenseExtractor.COLUMN_NUMBER) {
list.add(t);
previousForm = t[eu.fbk.twm.index.csv.OneExamplePerSenseExtractor.FORM_INDEX];
//logger.info(part + "\t\"" + t[3] + "\"");
part++;
}
} catch (Exception e) {
logger.error("Error at line " + count);
logger.error(e);
} finally {
tot++;
}
}
// read the rest of the file
while ((line = lnr.readLine()) != null) {
if (count > numForms) {
logger.info("Exit after " + count + " forms (" + numForms + ")");
break;
}
try {
t = tabPattern.split(line);
if (t.length == eu.fbk.twm.index.csv.OneExamplePerSenseExtractor.COLUMN_NUMBER) {
if (!t[eu.fbk.twm.index.csv.OneExamplePerSenseExtractor.FORM_INDEX].equals(previousForm)) {
//logger.debug("executing " + previousForm + " (" + list.size() + ")...");
//todo: filter forms with less than minimumFormFreq
//todo: add topic label
myExecutor.execute(new ExampleBuilder(list, previousForm));
list = new ArrayList();
count++;
part = 0;
}
list.add(t);
previousForm = t[eu.fbk.twm.index.csv.OneExamplePerSenseExtractor.FORM_INDEX];
part++;
}
} catch (Exception e) {
logger.error("Error at line " + tot);
logger.error(e);
} finally {
tot++;
}
//if (count > 500) break;
if ((tot % notificationPoint) == 0) {
//senseWriter.flush();
end = System.currentTimeMillis();
logger.info(df.format(tot) + "\t" + df.format(count) + "\t" + df.format(end - begin) + "\t" + new Date());
begin = System.currentTimeMillis();
}
} // end while
lnr.close();
// add the last line
list.add(t);
logger.debug("executing " + previousForm + " (" + list.size() + ")...");
myExecutor.execute(new ExampleBuilder(list, previousForm));
end = System.currentTimeMillis();
logger.info(df.format(tot) + "\t" + df.format(count) + "\t" + df.format(end - begin) + "\t" + new Date());
try {
myExecutor.shutdown();
logger.info("waiting for execution...");
myExecutor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
} catch (InterruptedException e) {
logger.error(e);
}
logger.info("closing the streams...");
senseWriter.close();
end();
logger.info("done it");
}
public abstract void buildExamples(Map> senseMap, String form);
public abstract void end();
/**
* This class is a factory for building the examples
*/
public class ExampleBuilder implements Runnable {
private List list;
private String form;
public ExampleBuilder(List list, String form) {
this.list = list;
this.form = form;
}
public void run() {
buildExamples(createSenseListMap(list), form);
}
}
/**
* Returns a map in which the keys are senses and values are senseList of examples.
*
* @return a map in which the keys are senses and values are senseList of examples.
*/
Map> createSenseListMap(List senseList) {
Map> map = new HashMap>();
String[] line;
String key;
for (int i = 0; i < senseList.size(); i++) {
line = senseList.get(i);
key = line[eu.fbk.twm.index.csv.OneExamplePerSenseExtractor.PAGE_INDEX];
List list = map.get(key);
if (list == null) {
list = new ArrayList();
map.put(key, list);
}
list.add(line);
}
return map;
}
}