// lv.lnb.ner.tagFolders Maven / Gradle / Ivy
/*******************************************************************************
* Copyright 2012 Institute of Mathematics and Computer Science, University of Latvia
* Author: Pēteris Paikens
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*******************************************************************************/
package lv.lnb.ner;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.*;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
import java.util.ArrayList;
import java.util.List;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
/**
 * Command-line batch tool: walks a directory tree, runs a Stanford CRF sequence
 * classifier over every file it finds, and passes the tagged entities to
 * NECounterSingleDoc, which writes them through the supplied Writer.
 */
public class tagFolders {
/**
 * Path of the serialized classifier model. main replaces this default with the
 * first command-line argument when one is given; walk reads it when it has to
 * load a classifier.
 */
static String serializedClassifier = "/Users/pet/Dropbox/NER/stanford-ner-2012-04-07/lv-ner-model.ser.gz";
/**
 * Program entry point. Optionally overrides the classifier model path, then
 * walks the fixed corpus folder and prints the integer returned by walk
 * (without a trailing newline).
 *
 * @param args when non-empty, args[0] replaces the value of serializedClassifier
 * @throws IOException propagated from walk
 */
public static void main(String[] args) throws IOException {
    // A single batch such as "/Users/pet/Documents/LNB_converted/01-03-01"
    // was previously used here instead of the whole corpus.
    final String corpusRoot = "/Users/pet/Documents/LNB_converted";

    if (args.length >= 1) {
        serializedClassifier = args[0];
    }

    // Root call: no classifier and no writer yet; walk creates them as needed.
    System.out.print(walk(corpusRoot, null, null));
}
/**
 * Reads one tab-separated token file, classifies its tokens and records the
 * resulting entities for the given document.
 *
 * NOTE(review): this listing looks as if text resembling markup was stripped
 * from it (several string literals are empty or broken and the collections are
 * raw types). Confirm against the original source before relying on the
 * filtering conditions below.
 *
 * @param classifier sequence classifier used to tag the collected tokens
 * @param filename   path of the input file, read as UTF-8
 * @param doc_id     identifier passed to NECounterSingleDoc and to counter.add
 * @param writer     handed unchanged to counter.db_insert
 * @return the number of lines read from the file, including skipped lines
 */
private static int processFile(
AbstractSequenceClassifier classifier,
String filename,
String doc_id,
Writer writer) {
NECounterSingleDoc counter = new NECounterSingleDoc(doc_id);
// NOTE(review): raw List/ArrayList; the for-each over CoreLabel further down
// needs List<CoreLabel> -- the type arguments were presumably lost.
List document = new ArrayList();
// Line counter; incremented for every line read, before any filtering.
int i = 0;
try {
BufferedReader ieeja = new BufferedReader( new InputStreamReader(new FileInputStream(filename), "UTF-8"));
String line;
while ((line = ieeja.readLine()) != null) {
i++;
// NOTE(review): all three literals are empty here, so contains("") is always
// true and every line is skipped. Presumably these were markup-like markers.
if (line.contains("") || line.contains("") ||line.contains("")) continue;
// Fields are tab-separated: [0] text, [1] lemma, [2] tag (see the else branch).
String[] info = line.split("\t");
CoreLabel word = new CoreLabel();
// NOTE(review): the literal below is split across two lines, which is not
// valid Java; its original content is not recoverable from this listing.
if (line.contains("
")) {
word.set(TextAnnotation.class, "");
word.set(LemmaAnnotation.class, "");
word.set(PartOfSpeechAnnotation.class, "-");
// The continue skips document.add, so the annotations set above are discarded.
continue;
} else if (line.contains(" ")) {
word.set(TextAnnotation.class, " ");
word.set(LemmaAnnotation.class, " ");
word.set(PartOfSpeechAnnotation.class, "-");
// As above: this token never reaches the document.
continue;
} else if (info.length<3) {
// Too few fields: report line number, file and content on stderr (no newline
// in the format). Control then falls through to document.add with a CoreLabel
// that has no annotations set.
System.err.printf("%d @ %s:%s", i, filename, line);
} else {
word.set(TextAnnotation.class, info[0]);
word.set(LemmaAnnotation.class, info[1]);
// Only the first character of the third field is kept as the part of speech.
word.set(PartOfSpeechAnnotation.class, info[2].substring(0, 1));
}
document.add(word);
}
// Reached only when reading completes without an exception.
ieeja.close();
} catch (IOException e1) {
// A read failure is only reported; classification below still runs on
// whatever tokens were collected before the failure.
e1.printStackTrace();
}
List out = classifier.classify(document);
// State for merging consecutive tokens that share a tag into one entity.
String prevtag = "";
String name_part = "";
String lemma_part = "";
for (CoreLabel word : out) {
String tag = word.get(AnswerAnnotation.class);
// Tags shorter than two characters are treated as "no entity".
if (tag.length()<2) tag = "";
if (!tag.equalsIgnoreCase(prevtag)) {
// Tag changed: emit the entity collected so far, if there was one.
if (!prevtag.equalsIgnoreCase("")) counter.add(doc_id, name_part, lemma_part, prevtag);
if (!tag.equalsIgnoreCase("")) {
// Start a new entity with this token.
name_part = word.word();
lemma_part = word.get(LemmaAnnotation.class);
}
} else if (!tag.equalsIgnoreCase("")) {
// Same non-empty tag as the previous token: extend the current entity.
name_part = name_part + " " + word.word();
lemma_part = lemma_part + " " + word.get(LemmaAnnotation.class);
}
prevtag = tag;
}
// NOTE(review): an entity still open when the loop ends (last token tagged)
// is never passed to counter.add.
try {
counter.db_insert(writer);
} catch (Exception e) {
// Any failure while writing the results is reported and otherwise ignored.
e.printStackTrace();
}
return i;
}
/**
 * Recursively processes every non-hidden entry below {@code path}.
 *
 * Regular files are passed to processFile. For each sub-directory the method
 * recurses; when no writer was supplied by the caller, a new UTF-8 file
 * {@code entities.sql} is created inside that sub-directory, used for the whole
 * subtree, and closed afterwards (also when the recursion fails). A progress
 * line is printed for each sub-directory that owns such a file.
 *
 * The classifier is loaded from serializedClassifier when none was passed in
 * and {@code path} does not end with "converted".
 *
 * NOTE(review): a regular file met while the classifier or the writer is still
 * null is forwarded to processFile with that null value, as before.
 *
 * @param path             directory to scan
 * @param parentclassifier classifier inherited from the caller, or null
 * @param writer           output inherited from the caller, or null
 * @return sum of the values returned by processFile for all files in the subtree
 * @throws IOException if {@code path} cannot be listed as a directory, or if
 *                     creating, writing or closing an output file fails
 */
private static int walk( String path,
        AbstractSequenceClassifier parentclassifier,
        Writer writer
        ) throws IOException {
    File root = new File(path);
    File[] list = root.listFiles();
    if (list == null) {
        // listFiles() returns null, not an empty array, when the path is not a
        // directory or an I/O error occurs; fail with a clear message instead
        // of a NullPointerException in the loop below.
        throw new IOException("Cannot list directory: " + path);
    }

    int i = 0;
    AbstractSequenceClassifier classifier = parentclassifier;
    if (classifier == null && !path.endsWith("converted")) {
        classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier);
    }

    for (File f : list) {
        if (f.getName().startsWith(".")) continue;

        if (f.isDirectory()) {
            // This level owns (and must close) the writer only if none was inherited.
            boolean ownsWriter = (writer == null);
            Writer out = writer;
            if (ownsWriter) {
                out = new BufferedWriter(new OutputStreamWriter(
                        new FileOutputStream(f.getAbsolutePath() + "/entities.sql"), "UTF-8"));
            }
            try {
                if (ownsWriter) {
                    out.write("-- " + f.getAbsolutePath() + "\n");
                }
                long started = System.currentTimeMillis();
                int words = walk(f.getAbsolutePath(), classifier, out);
                i += words;
                if (ownsWriter) {
                    // Clamp to 1 ms so a very fast subtree cannot divide by zero.
                    long elapsed = Math.max(1L, System.currentTimeMillis() - started);
                    // 1000L forces long arithmetic; int words*1000 overflows above ~2.1M.
                    long perSecond = words * 1000L / elapsed;
                    System.out.printf( "Finished %s : %,d k-words, %,d wps\n", f.getAbsoluteFile(), i/1000, perSecond);
                }
            } finally {
                // Release the file handle even if the recursive walk threw.
                if (ownsWriter) {
                    out.close();
                }
            }
        } else {
            i += processFile(classifier, f.getAbsoluteFile().toString(), f.getAbsoluteFile().getParentFile().getName(), writer);
        }
    }
    return i;
}
}