All Downloads are FREE. Search and download functionalities are using the official Maven repository.

lv.lnb.ner.tagFolders Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright 2012 Institute of Mathematics and Computer Science, University of Latvia
 * Author: Pēteris Paikens
 * 
 *     This program is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 * 
 *     This program is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU General Public License for more details.
 * 
 *     You should have received a copy of the GNU General Public License
 *     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *******************************************************************************/
package lv.lnb.ner;
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.*;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;

import java.util.ArrayList;
import java.util.List;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;

public class tagFolders {
	static String serializedClassifier = "/Users/pet/Dropbox/NER/stanford-ner-2012-04-07/lv-ner-model.ser.gz";
	
    public static void main(String[] args) throws IOException {

      if (args.length > 0) {
        serializedClassifier = args[0];
      }

      //String startFolder = "/Users/pet/Documents/LNB_converted/01-03-01";
      String startFolder = "/Users/pet/Documents/LNB_converted";
      
      int total = walk(startFolder, null, null);
      System.out.print(total);
    }

	private static int processFile(
			AbstractSequenceClassifier classifier,
			String filename,
			String doc_id,
			Writer writer) {

		NECounterSingleDoc counter = new NECounterSingleDoc(doc_id);	
		
		List document = new ArrayList(); 
		int i = 0;
		try {
			BufferedReader ieeja = new BufferedReader(	new InputStreamReader(new FileInputStream(filename), "UTF-8"));
			String line;
			while ((line = ieeja.readLine()) != null) {
				i++;
				if (line.contains("") || line.contains("") ||line.contains("

")) continue; String[] info = line.split("\t"); CoreLabel word = new CoreLabel(); if (line.contains("

")) { word.set(TextAnnotation.class, "

"); word.set(LemmaAnnotation.class, "

"); word.set(PartOfSpeechAnnotation.class, "-"); continue; } else if (line.contains("")) { word.set(TextAnnotation.class, ""); word.set(LemmaAnnotation.class, ""); word.set(PartOfSpeechAnnotation.class, "-"); continue; } else if (info.length<3) { System.err.printf("%d @ %s:%s", i, filename, line); } else { word.set(TextAnnotation.class, info[0]); word.set(LemmaAnnotation.class, info[1]); word.set(PartOfSpeechAnnotation.class, info[2].substring(0, 1)); } document.add(word); } ieeja.close(); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } List out = classifier.classify(document); String prevtag = ""; String name_part = ""; String lemma_part = ""; for (CoreLabel word : out) { String tag = word.get(AnswerAnnotation.class); if (tag.length()<2) tag = ""; if (!tag.equalsIgnoreCase(prevtag)) { if (!prevtag.equalsIgnoreCase("")) counter.add(doc_id, name_part, lemma_part, prevtag); if (!tag.equalsIgnoreCase("")) { name_part = word.word(); lemma_part = word.get(LemmaAnnotation.class); } } else if (!tag.equalsIgnoreCase("")) { name_part = name_part + " " + word.word(); lemma_part = lemma_part + " " + word.get(LemmaAnnotation.class); } prevtag = tag; } try { counter.db_insert(writer); } catch (Exception e) { e.printStackTrace(); } return i; } private static int walk( String path, AbstractSequenceClassifier parentclassifier, Writer writer ) throws IOException { File root = new File( path ); File[] list = root.listFiles(); int i = 0; AbstractSequenceClassifier classifier = parentclassifier; if (classifier == null && !path.endsWith("converted")) classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier); for ( File f : list ) { if (f.getName().startsWith(".")) continue; if ( f.isDirectory() ) { Writer straume; if (writer == null) { straume = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f.getAbsolutePath() + "/entities.sql"), "UTF-8")); straume.write("-- " + f.getAbsolutePath() + "\n"); } else straume = writer; 
long sākums = System.currentTimeMillis(); int words = walk( f.getAbsolutePath(), classifier, straume ); i += words; if (writer == null) { long beigas = System.currentTimeMillis(); System.out.printf( "Finished %s : %,d k-words, %,d wps\n", f.getAbsoluteFile(), i/1000, words*1000/(beigas-sākums)); straume.close(); } } else { i += processFile(classifier, f.getAbsoluteFile().toString(), f.getAbsoluteFile().getParentFile().getName(), writer); } } return i; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy