// Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. The project price is only $1.
// You can buy this project and download/modify it as often as you want.
/*******************************************************************************
* Copyright 2012 Institute of Mathematics and Computer Science, University of Latvia
* Author: Pēteris Paikens
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*******************************************************************************/
package lv.lnb.ner;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import java.util.regex.Pattern;

import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.IntCounter;

import lv.semti.morphology.analyzer.Analyzer;
import lv.semti.morphology.analyzer.Splitting;
import lv.semti.morphology.analyzer.Word;
import lv.semti.morphology.analyzer.Wordform;
import lv.semti.morphology.attributes.AttributeNames;
public class MergeEntityInformation {
static int filter_floor = 10; // vismaz cik pieminējumiem jābūt, lai iekļautu entīti sarakstā
static Counter counter = new IntCounter();
static Counter counterbydoc = new IntCounter();
static HashMap> popular_forms = new HashMap>();
static HashMap best_forms = new HashMap();
static HashMap blacklist = new HashMap();
static HashMap doc_ids = new HashMap();
static HashMap name_ids = new HashMap();
/**
 * Builds the SQL import scripts from the aggregated named-entity list.
 * <p>
 * Steps: load the blacklist dictionaries and the document/name id tables, feed every
 * line of the entity file to {@code add_word}, prune rarely mentioned entities with
 * {@code Counters.retainAbove(counter, filter_floor)}, pick and normalize the most
 * frequent surface form of each remaining entity, then write two scripts: inserts for
 * names missing from the name table, and inserts linking names to documents.
 *
 * @param args ignored; all input and output paths are hard-coded below
 * @throws Exception if a file cannot be read or written or the lexicon cannot be loaded
 */
public static void main(String[] args) throws Exception {
    String all_entity_file = "/Users/pet/Documents/LNB_converted/all_entities.txt";
    String doc_id_file = "/Users/pet/Dropbox/LNB/data/data_LNB_documents.txt";
    String name_id_file = "/Users/pet/Dropbox/LNB/data/data_sep_LNB_names2.txt";
    String entity_doc_file = "/Users/pet/Documents/LNB_converted/entities_documents.sql";
    String new_name_file = "/Users/pet/Documents/LNB_converted/entities.sql";
    String src = "PP120924";

    loadBlacklist("NERdicts/Forenames.txt");
    loadBlacklist("NERdicts/blacklist.txt");
    // Both tables are tab separated with the id in column 0 and the lookup key in column 1.
    loadIdMap(doc_id_file, "id", false, doc_ids);
    loadIdMap(name_id_file, "ID", true, name_ids);

    int i = 0;
    BufferedReader ieeja = openUtf8Reader(all_entity_file);
    try {
        String line;
        while ((line = ieeja.readLine()) != null) {
            if (line.startsWith("--")) continue;
            i++;
            String[] info = line.split("\t");
            if (info.length < 5) {
                System.err.printf("%d @ %s%n", i, line);
                continue;
            }
            int mention_count;
            try {
                mention_count = Integer.parseInt(info[4]);
            } catch (NumberFormatException e) {
                // A malformed count is reported like any other malformed line instead of aborting the run.
                System.err.printf("%d @ %s%n", i, line);
                continue;
            }
            add_word(info[0].trim(), info[1].trim(), info[2], info[3], mention_count);
        }
    } finally {
        ieeja.close();
    }

    System.out.printf("Apskatījām %d ierakstus, %d dažādi.\n", i, counter.size());
    Counters.retainAbove(counter, filter_floor);
    System.out.printf("\t%d no tiem vismaz %d reizes.\n\n", counter.size(), filter_floor);

    Analyzer analyzer = new Analyzer("dist/Lexicon.xml");
    for (Entry<String, Double> entry : counter.entrySet()) {
        String key = entry.getKey();
        Counter<String> forms = popular_forms.get(key);
        String best_form = Counters.argmax(forms);
        String category = key.split(Pattern.quote("|"))[0];
        best_forms.put(key, normalizeForm(best_form, category, analyzer));
    }

    BufferedWriter entityDocOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(entity_doc_file), "UTF-8"));
    try {
        BufferedWriter newNameOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new_name_file), "UTF-8"));
        try {
            for (Entry<String, Double> entry : counterbydoc.entrySet()) {
                String[] info = entry.getKey().split(Pattern.quote("|"));
                if (info.length < 3) {
                    // Report the offending key itself; no input line is available at this stage.
                    System.err.printf("bad entity key: %s%n", entry.getKey());
                    continue;
                }
                String category = info[0];
                String normalform = info[1];
                String doc = doc_ids.get(info[2]);
                if (doc == null) System.err.println("bad docid: " + info[2]);

                String best_form = best_forms.get(category + "|" + normalform);
                if (best_form == null) continue; // entity did not pass the mention-count filter

                String name_id = name_ids.get(best_form);
                if (name_id == null) {
                    String sqlForm = sqlEscape(best_form);
                    String insert = String.format("insert into entity (categoryid, definition, infoSource) values (%s, \"%s\", \"%s\");\n", category, sqlForm, src);
                    insert += String.format("insert into name (name, infoSource) values (\"%s\", \"%s\");\n", sqlForm, src);
                    insert += String.format("insert into entityName(nameID, entityID, infoSource) select LAST_INSERT_ID() as nameID, ID as entityID, \"%s\" from entity where definition=\"%s\" and infoSource=\"%s\";\n", src, sqlForm, src);
                    newNameOut.append(String.format("%s\n", insert));
                    // NOTE(review): fixed placeholder id shared by every newly added name; the real
                    // id only exists after the insert script has run - confirm how it gets resolved.
                    name_id = "1234567890";
                    name_ids.put(best_form, name_id);
                }
                // Without a known document id the link row would carry documentID = null, so skip it.
                if (doc != null) {
                    entityDocOut.append(String.format("insert into nameDocument (nameID, documentID, occurrences, infoSource) values (%s, %s, %d, \"%s\");\n", name_id, doc, entry.getValue().intValue(), src));
                }
            }
        } finally {
            newNameOut.close();
        }
    } finally {
        entityDocOut.close();
    }
}

/** Opens a buffered UTF-8 reader on the given file. */
private static BufferedReader openUtf8Reader(String path) throws IOException {
    return new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"));
}

/** Adds every line of the given UTF-8 file to {@code blacklist}, stored as both key and value. */
private static void loadBlacklist(String path) throws IOException {
    BufferedReader in = openUtf8Reader(path);
    try {
        String line;
        while ((line = in.readLine()) != null) blacklist.put(line, line);
    } finally {
        in.close();
    }
}

/**
 * Reads a tab-separated id table into {@code target}, mapping column 1 to column 0.
 * Lines starting with {@code headerPrefix} are skipped; lines with fewer than two
 * columns are reported on stderr. When {@code stripQuotes} is set, all double quotes
 * are removed from the key column.
 */
private static void loadIdMap(String path, String headerPrefix, boolean stripQuotes,
        HashMap<String, String> target) throws IOException {
    BufferedReader in = openUtf8Reader(path);
    try {
        String line;
        while ((line = in.readLine()) != null) {
            if (line.startsWith(headerPrefix)) continue;
            String[] info = line.split("\t");
            if (info.length < 2) {
                System.err.printf("Can't split doc id fields: %s%n", line);
                continue;
            }
            String key = stripQuotes ? info[1].replace("\"", "") : info[1];
            target.put(key, info[0]);
        }
    } finally {
        in.close();
    }
}

/** Escapes backslashes and double quotes so the value can sit inside a double-quoted SQL string literal. */
private static String sqlEscape(String value) {
    return value.replace("\\", "\\\\").replace("\"", "\\\"");
}
/**
 * Registers mentions of one entity occurrence in the static counters.
 *
 * @param word          surface form as it was found in the text
 * @param normalform    normalized form; together with the category it forms the entity key
 * @param category      entity category, used as the first part of the key
 * @param doc           document identifier, appended to the key for the per-document tally
 * @param mention_count number of mentions to add
 */
private static void add_word(String word, String normalform,
String category, String doc, int mention_count) {
// Ignore words shorter than three characters once surrounding whitespace is removed.
if (word.trim().length()<3) return;
if (word.contains(",")) {
// Split the surface form and the normal form on commas; the two must yield the same number of parts.
String[] wordparts = word.split(",");
String[] formparts = normalform.split(",");
if (wordparts.length != formparts.length) {
// The message reports that the comma counts of the two strings differ.
System.err.println(String.format("Nesakrīt komatu skaits '%s' un '%s'.", word, normalform));
return;
}
// NOTE(review): the next line is truncated in this copy of the file - the text between the loop
// condition and "50) return;" is missing, including the closing brace of the comma branch.
// Presumably the loop handled each comma-separated part and a later check rejected overly long
// words; the otherwise unused blacklist may also have been consulted here. Restore this line
// from the original source before relying on this method.
for (int i=0; i 50) return;
// Entities are identified by "category|normalform".
String key = category+"|"+normalform;
counter.incrementCount(key, mention_count);
// Tally how often each surface form is used for this entity, creating the tally on first use.
Counter forms = popular_forms.get(key);
if (forms == null) {
forms = new IntCounter();
popular_forms.put(key, forms);
}
forms.incrementCount(word, mention_count);
// Per-document tally, keyed by "category|normalform|doc".
counterbydoc.incrementCount(key+"|"+doc, mention_count);
}
/**
 * Normalizes the chosen surface form of an entity.
 * <p>
 * The form is tokenized; if the best analysis of the last token is strongly matched as
 * genitive or locative, that last token is replaced by its lemma (capitalized when the
 * token started with an upper-case letter). A result ending in " kaps" then has that
 * suffix changed to " kapi". Results containing a backslash or a double quote are
 * reported on stderr because they are awkward to embed in the generated SQL.
 *
 * @param form     surface form to normalize
 * @param category entity category; currently not used by the normalization itself
 * @param analyzer morphological analyzer used for tokenization
 * @return the normalized form, or {@code form} unchanged when nothing applies
 */
static String normalizeForm(String form, String category, Analyzer analyzer) {
    String result = form;
    List<Word> words = Splitting.tokenize(analyzer, result);
    // An empty token list (e.g. blank input) leaves nothing to inflect.
    if (!words.isEmpty()) {
        Word lastword = words.get(words.size() - 1);
        String token = lastword.getToken();
        Wordform bestform = lastword.getBestWordform();
        boolean inflected = bestform != null
                && token != null && token.length() > 0
                && (bestform.isMatchingStrong(AttributeNames.i_Case, AttributeNames.v_Genitive)
                    || bestform.isMatchingStrong(AttributeNames.i_Case, AttributeNames.v_Locative));
        if (inflected) {
            String replacement = bestform.getValue(AttributeNames.i_Lemma);
            if (replacement != null) {
                if (Character.isUpperCase(token.charAt(0))) replacement = NE.capitalizeString(replacement);
                // Replace only the final occurrence, i.e. the last word itself; String.replace would
                // also rewrite identical text earlier in the name or inside other words.
                int at = result.lastIndexOf(token);
                if (at >= 0) {
                    result = result.substring(0, at) + replacement + result.substring(at + token.length());
                }
            }
        }
    }
    // Only the trailing word is adjusted, so an earlier " kaps..." inside the name stays intact.
    String singular = " kaps";
    if (result.endsWith(singular)) {
        result = result.substring(0, result.length() - singular.length()) + " kapi";
    }
    if (result.contains("\\") || result.contains("\"")) {
        System.err.printf("%s\n", result);
    }
    return result;
}
}