All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.fbk.dh.kd.Main Maven / Gradle / Ivy

The newest version!
package eu.fbk.dh.kd;

import com.google.common.base.Joiner;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import eu.fbk.dh.kd.lib.KD_configuration;
import eu.fbk.dh.kd.lib.KD_configuration.ColumExtraction;
import eu.fbk.dh.kd.lib.KD_configuration.Group;
import eu.fbk.dh.kd.lib.KD_core;
import eu.fbk.dh.kd.lib.KD_core.Language;
import eu.fbk.dh.kd.lib.KD_core.Threads;
import eu.fbk.dh.kd.lib.KD_keyconcept;
import eu.fbk.dh.kd.lib.KD_loader;
import eu.fbk.dh.kd.models.KD_Model;
import org.apache.commons.cli.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;


/**
 * eu.fbk.dh.kd.Main runnable class
 * Please refer to the help for more information about the parameters
 *
 * @author Giovanni Moretti - DH Group FBK.
 * @version $Id: $Id
 */
public class Main {

    /**
     * 

main.

* * @param args an array of {@link java.lang.String} objects. */ @SuppressWarnings("static-access") public static void main(String[] args) { KD_configuration configuration = new KD_configuration(); configuration.numberOfConcepts = -1; configuration.max_keyword_length = 4; configuration.local_frequency_threshold = 2; configuration.prefer_specific_concept = KD_configuration.Prefer_Specific_Concept.MEDIUM; configuration.skip_proper_noun = false; configuration.skip_keyword_with_proper_noun = false; configuration.rerank_by_position = false; configuration.group_by = KD_configuration.Group.NONE; configuration.column_configuration = KD_configuration.ColumExtraction.TOKEN_POS_LEMMA; configuration.only_multiword = false; configuration.tagset = KD_configuration.Tagset.TEXTPRO; Language lang = Language.ENGLISH; int cores = Runtime.getRuntime().availableProcessors(); boolean useStanford = false; boolean save_stanford = false; Threads t; switch (cores) { case 1: t = Threads.ONE; break; case 2: t = Threads.TWO; break; case 4: t = Threads.FOUR; break; case 6: t = Threads.SIX; break; case 8: t = Threads.EIGHT; break; case 10: t = Threads.TEN; break; case 12: t = Threads.TWELVE; break; default: t = Threads.TWO; break; } boolean STDOUT = false; ///////////////////////////////// command line parser ////////////////////////////////////// CommandLineParser parser = new PosixParser(); Options options = new Options(); //options.addOption( "n", "number_of_concept", false, "do not hide entries starting with ." ); options.addOption(OptionBuilder.withLongOpt("number_of_concept").withDescription("number of output keywords").withArgName("Integer").withType(Integer.class).hasArg().create("n")); options.addOption(OptionBuilder.withLongOpt("max_keyword_length").withDescription("maximum length of multi-word expressions").withArgName("Integer").withType(Integer.class).hasArg().create("m")); options.addOption(OptionBuilder.withLongOpt("local_frequency_threshold").withDescription("min number of occurrences in a text").withArgName("Integer").withType(Integer.class).hasArg().create("l")); options.addOption(OptionBuilder.withLongOpt("number_of_threads").withDescription("number of threads used by the program").withType(Integer.class).withArgName("ONE | TWO | FOUR | SIX | EIGHT | TEN | TWELVE").hasArg().create("t")); options.addOption(OptionBuilder.withLongOpt("prefer_specific_concept").withDescription("give a boost to more specific key-concept (multi-word)").withArgName("NO | WEAK | MEDIUM | STRONG | MAX").hasArg().create("p")); options.addOption(OptionBuilder.withLongOpt("column_configuration").withDescription("specify the input file column configuration\neg: CUSTOM_0,9,6 token is 0, lemma is 9, pos is 6").withArgName("TOKEN_LEMMA_POS | TOKEN_POS_LEMMA | CUSTOM_#token,#lemma,#pos").hasArg().create("c")); options.addOption(OptionBuilder.withLongOpt("language").withDescription("specify the language of the input file").withArgName("ENGLISH | ITALIAN | CUSTOM | CUSTOM_").hasArg().create("lang")); options.addOption(OptionBuilder.withLongOpt("tagset").withDescription("specify the tagset of the pos tagger (default is TEXTPRO)").withArgName("TEXTPRO | STANFORD | TREETAGGER | CUSTOM").hasArg().create("ts")); options.addOption(OptionBuilder.withLongOpt("group").withDescription("set the group configuration").withArgName("NONE | BY_LIST | BY_STEM | ALL_LEMMA").hasArg().create("g")); options.addOption(OptionBuilder.withLongOpt("lang_folder").withDescription("set the language folder path").withArgName("Path to the folder").hasArg().create("lp")); options.addOption(OptionBuilder.withLongOpt("new_language").withDescription("create new empty language, in your language_folder").withArgName("Language name").hasArg().create("nl")); options.addOption(OptionBuilder.withLongOpt("new_language_folder").withDescription("create new empty language folder from scratch").withArgName("Path to new language folder").hasArg().create("nf")); options.addOption("STDOUT", "standard_out", false, "print results on standard out"); options.addOption("h", "help", false, "print this message"); options.addOption("om", "only_multiword", false, "display only multi-words"); options.addOption("fas", "skip_frequency_absorption", false, "skip frequency absorption"); options.addOption("wp", "use_pattern_weight", false, "use the weight of pattern"); options.addOption("ba", "boost_acronyms", false, "boost acronyms (for scientific articles)"); options.addOption("v", "verbose", false, "verbose output"); options.addOption("us", "use_stanford", false, "use included stanford pos tagger (only english)"); options.addOption("ve", "version", false, "print version and exit"); options.addOption("s", "skip_proper_noun", false, "skip proper nouns"); options.addOption("sk", "skip_keyword_with_proper_noun", false, "skip keyword with proper nouns"); options.addOption("skw", "skip_keyword_with_not_allowed_words", false, "skip keywords that contain a keyconcept-no item"); options.addOption("r", "rerank_by_position", false, "give a boost to key-concepts on the top of the document"); options.addOption("ns", "no_synonyms", false, "disable the synonym resolution"); options.addOption("ss", "save_stanford", false, "save stanford preprocessed file"); options.addOption("nr", "no_rerank", false, "disable the re-rank function"); options.addOption("cp", "capitalize_pos", false, "capitalize token with specified pos"); options.addOption("nabs", "no_abstract_keyconcept", false, "disable the boost on abstract key-concepts"); options.addOption("nidf", "no_idf", false, "disable the boost by the idf value"); CommandLine line = null; try { line = parser.parse(options, args); //---------------------------------------boolean values if (line.hasOption("help")) { HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); formatter.printHelp("KD_Keyphrase_Digger", options); System.exit(0); } if (line.hasOption("version")) { HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); System.out.println("\n" + KD_core.getVersion() + "\n"); System.exit(0); } if (line.hasOption("no_rerank")) { configuration.no_rerank = true; } if (line.hasOption("use_pattern_weight")) { configuration.use_pattern_weight = true; } if (line.hasOption("boost_acronyms")) { configuration.boost_acronyms = true; } if (line.hasOption("no_abstract_keyconcept")) { configuration.no_abstract = true; } if (line.hasOption("no_idf")) { configuration.no_idf = true; } if (line.hasOption("no_synonyms")) { configuration.no_syn = true; } if (line.hasOption("rerank_by_position")) { configuration.rerank_by_position = true; } if (line.hasOption("capitalize_pos")) { configuration.capitalize_pos = true; } if (line.hasOption("verbose")) { configuration.verbose = true; } if (line.hasOption("skip_frequency_absorption")) { configuration.skipFrequencyAbsorption = true; } if (line.hasOption("only_multiword")) { configuration.only_multiword = true; } if (line.hasOption("skip_proper_noun")) { configuration.skip_proper_noun = true; } if (line.hasOption("skip_keyword_with_proper_noun")) { configuration.skip_keyword_with_proper_noun = true; } if (line.hasOption("skip_keyword_with_not_allowed_words")) { configuration.skip_keyword_with_not_allowed_words = true; } if (line.hasOption("use_stanford")) { useStanford = true; } if (line.hasOption("save_stanford")) { save_stanford = true; } if (line.hasOption("standard_out")) { STDOUT = true; } //-------------------------------------end boolean values //-------------------------------------properties values if (line.hasOption("number_of_threads")) { try { t = Threads.valueOf(line.getOptionValue("number_of_threads").toUpperCase()); } catch (Exception e) { System.out.println("\nerror: Wrong value for the option number_of_threads\n"); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); formatter.printHelp("KD_Keyphrase_Digger", options); System.exit(1); } } if (line.hasOption("lang_folder")) { try { configuration.languagePackPath = line.getOptionValue("lang_folder"); } catch (Exception e) { System.out.println("\nerror: Wrong value for the option lang_folder\n"); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); formatter.printHelp("KD_Keyphrase_Digger", options); System.exit(1); } } if (line.hasOption("new_language")) { try { String lang_name = line.getOptionValue("new_language"); KD_core kxc = new KD_core(t); kxc.createNewEmptyLanguage(lang_name.toUpperCase(),configuration); System.out.println ("\nThe new \""+lang_name.toUpperCase()+"\" language has been added to the languages.\nTo use it please specify \"CUSTOM_"+lang_name.toUpperCase()+"\" in the language parameter (-lang)."); System.exit(0); } catch (Exception e) { System.out.println("\nerror: Wrong value for the option new_language\n"); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); formatter.printHelp("KD_Keyphrase_Digger", options); System.exit(1); } } if (line.hasOption("new_language_folder")) { try { String path_new_lang = line.getOptionValue("new_language_folder"); KD_Model model = new KD_Model(FileSystems.getDefault().getPath(path_new_lang)); System.out.println ("\nThe new language folder has been created in : "+model.getCurrent_language_path()+" .\nTo use it please specify \""+model.getCurrent_language_path()+"\" in the lang_folder (-lp) parameter."); System.exit(0); } catch (Exception e) { System.out.println("\nerror: Wrong value for the option new_language_folder\n"); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); formatter.printHelp("KD_Keyphrase_Digger", options); System.exit(1); } } if (line.hasOption("prefer_specific_concept")) { try { configuration.prefer_specific_concept = KD_configuration.Prefer_Specific_Concept.valueOf(line.getOptionValue("prefer_specific_concept").toUpperCase()); } catch (Exception e) { System.out.println("\nerror: Wrong value for the option prefer_speficic_concept\n"); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); formatter.printHelp("KD_Keyphrase_Digger", options); System.exit(1); } } if (line.hasOption("column_configuration")) { try { if (line.getOptionValue("column_configuration").toUpperCase().startsWith("CUSTOM_")) { configuration.column_configuration = ColumExtraction.valueOf("CUSTOM"); String columnPositions = line.getOptionValue("column_configuration").split("_")[1]; configuration.token_position = Integer.parseInt(columnPositions.split(",")[0].trim().replace("{", "").replace("}", "")); configuration.lemma_position = Integer.parseInt(columnPositions.split(",")[1].trim().replace("{", "").replace("}", "")); configuration.pos_position = Integer.parseInt(columnPositions.split(",")[2].trim().replace("{", "").replace("}", "")); } else { configuration.column_configuration = ColumExtraction.valueOf(line.getOptionValue("column_configuration").toUpperCase()); } } catch (Exception e) { System.out.println("\nerror: Wrong value for the option column_configuration\n"); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); formatter.printHelp("KD_Keyphrase_Digger", options); System.exit(1); } } if (line.hasOption("number_of_concept")) { try { configuration.numberOfConcepts = Integer.parseInt(line.getOptionValue("number_of_concept")); } catch (Exception e) { System.out.println("\nerror: Wrong value for the option number_of_concept\n"); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); formatter.printHelp("KD_Keyphrase_Digger", options); System.exit(1); } } if (line.hasOption("max_keyword_length")) { try { configuration.max_keyword_length = Integer.parseInt(line.getOptionValue("max_keyword_length")); } catch (Exception e) { System.out.println("\nerror: Wrong value for the option max_keyword_length\n"); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); formatter.printHelp("KD_Keyphrase_Digger", options); System.exit(1); } } if (line.hasOption("local_frequency_threshold")) { try { configuration.local_frequency_threshold = Integer.parseInt(line.getOptionValue("local_frequency_threshold")); } catch (Exception e) { System.out.println("\nerror: Wrong value for the option local_frequency_threshold\n"); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); formatter.printHelp("KD_Keyphrase_Digger", options); System.exit(1); } } if (line.hasOption("group")) { try { configuration.group_by = Group.valueOf(line.getOptionValue("group").toUpperCase()); } catch (Exception e) { System.out.println("\nerror: Wrong value for the option group\n"); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); formatter.printHelp("KD_Keyphrase_Digger", options); System.exit(1); } } if (line.hasOption("language")) { try { String stringlang = line.getOptionValue("language").toUpperCase().split("_")[0]; lang = Language.valueOf(stringlang); if (line.getOptionValue("language").toUpperCase().split("_",2).length > 1){ String custom_lang = line.getOptionValue("language").toUpperCase().split("_",2)[1]; lang.set_Custom_Language(custom_lang); } } catch (Exception e) { e.printStackTrace(); System.out.println("\nerror: Wrong value for the option language\n"); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); formatter.printHelp("KD_Keyphrase_Digger", options); System.exit(1); } } if (line.hasOption("tagset")) { try { configuration.tagset = KD_configuration.Tagset.valueOf(line.getOptionValue("tagset").toUpperCase()); } catch (Exception e) { System.out.println("\nerror: Wrong value for the option tagset\n"); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(500); formatter.printHelp("KD_Keyphrase_Digger", options); System.exit(1); } } } catch (Exception exp) { System.out.println("Unexpected exception:" + exp.getMessage()); } if (lang != Language.ENGLISH && useStanford) { System.err.println("Please specify english as language if you want to use the integrated Stanford POS Tagger"); System.exit(1); } //////////////////////////////////// end command line parser//////////////////////////////// KD_core kxc = new KD_core(t); KD_Model model = new KD_Model(Paths.get(configuration.languagePackPath )); KD_loader.run_the_updater(lang, configuration.languagePackPath); StanfordCoreNLP pipeline = null; System.out.println("Processor detected: " + cores + " used " + t.toString().toLowerCase()); if (useStanford) { System.out.println("Load Stanford Model"); System.out.println("Override column configuration"); configuration.column_configuration = ColumExtraction.CUSTOM; configuration.token_position = 0; configuration.pos_position = 2; configuration.lemma_position = 1; System.out.println("Force tagset to STANFORD"); configuration.tagset = KD_configuration.Tagset.STANFORD; Properties props = new Properties(); props.setProperty("annotators", "tokenize, ssplit, pos, lemma"); props.setProperty("pos.model", configuration.languagePackPath +"/"+ lang.name() + "/tagger/stanford_model/english-bidirectional-distsim.tagger"); props.setProperty("pos.nthreads", "4"); pipeline = new StanfordCoreNLP(props); } @SuppressWarnings("unchecked") List filePaths = line.getArgList(); StringBuffer processed_filecontent = null; LinkedList concept_list = null; for (String filePath : filePaths) { List files = new ArrayList(); if (new File(filePath).isDirectory()) { String[] extensions = new String[]{"txt", "txp"}; files = (List) FileUtils.listFiles(new File(filePath), extensions, true); } else if (new File(filePath).isFile()) { files.add(new File(filePath)); } for (File f : files) { System.out.println("Starting the extraction for file:" + FilenameUtils.getBaseName(f.getAbsolutePath())); long startTime = System.currentTimeMillis(); kxc = new KD_core(t); processed_filecontent = new StringBuffer(); if (useStanford) { if (configuration.verbose) { System.out.print("Start Stanford preprocessing...."); } long startTimeStanford = System.currentTimeMillis(); try { processed_filecontent = new StringBuffer(); Annotation annotation = pipeline.process((new String(Files.readAllBytes(Paths.get(f.getAbsolutePath())))).replace("\r\n"," ").replace("\n"," ") ); List sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); for (CoreMap sentence : sentences) { List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); for (CoreLabel c : tokens) { processed_filecontent.append(c.get(CoreAnnotations.OriginalTextAnnotation.class) + "\t" + c.get(CoreAnnotations.LemmaAnnotation.class) + "\t" + c.get(CoreAnnotations.PartOfSpeechAnnotation.class) + "\n"); } } } catch (Exception e) { e.printStackTrace(); } long estimatedTimeStanford = System.currentTimeMillis() - startTimeStanford; if (configuration.verbose) { System.out.println("End Stanford preprocessing in : " + estimatedTimeStanford); } if (save_stanford) { try { BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameUtils.getFullPath(f.getAbsolutePath()) + FilenameUtils.getBaseName(f.getAbsolutePath()) + "_stanford.tsv"), "UTF-8")); out.write(processed_filecontent.toString()); out.close(); } catch (Exception e) { e.printStackTrace(); } } } concept_list = kxc.extractExpressions(lang, configuration, f.getAbsolutePath(), processed_filecontent); if (STDOUT) { int r = 1; StringBuffer output = new StringBuffer(); for (KD_keyconcept k : concept_list) { k.getMostUsedVariation(); switch (configuration.group_by){ case ALL_LEMMA: output.append(r + "." + "\t" + Joiner.on(" ").join(k.getMostUsedVariation()) +" (" + k.getSysnonyms() + ")\tfrequency: " + k.frequency + " ,score: " + k.score + " idf: " + k.getIdf() + " boost " + k.getScoreBoost() + " pattern_boost " + k.getPatternBoost() + " chain_l " + k.getTokenChainLength() + " stem: "+ k.getStemArray().toString() + " lemma: "+ k.getLemmaArray().toString() +"\n"); break; case NONE: output.append(r + "." + "\t" + Joiner.on(" ").join(k.getTokenArray()) +" (" + k.getSysnonyms() + ")\tfrequency: " + k.frequency + " ,score: " + k.score + " idf: " + k.getIdf() + " boost " + k.getScoreBoost() + " pattern_boost " + k.getPatternBoost() + " chain_l " + k.getTokenChainLength() + " stem: "+ k.getStemArray().toString() + " lemma: "+ k.getLemmaArray().toString() +"\n"); break; case BY_LIST: output.append(r + "." + "\t" + Joiner.on(" ").join(k.getTokenArray()) +" (" + k.getSysnonyms() + ")\tfrequency: " + k.frequency + " ,score: " + k.score + " idf: " + k.getIdf() + " boost " + k.getScoreBoost() + " pattern_boost " + k.getPatternBoost() + " chain_l " + k.getTokenChainLength() + " stem: "+ k.getStemArray().toString() +"\n"); break; case BY_STEM: output.append(r + "." + "\t" + Joiner.on(" ").join(k.getStemArray()) +" (" + k.getSysnonyms() + ")\tfrequency: " + k.frequency + " ,score: " + k.score + " idf: " + k.getIdf() + " boost " + k.getScoreBoost() + " pattern_boost " + k.getPatternBoost() + " chain_l " + k.getTokenChainLength() + " stem: "+ k.getStemArray().toString() +"\n"); break; } r++; } if (output.toString().length() > 0) { System.out.println(output.toString().substring(0, output.toString().length() - 1)); } else { System.out.println("Mmmmm no keywords extracted... That's strange...."); } } else { int r = 1; StringBuffer output = new StringBuffer(); output.append("rank\tkeyword\tsynonyms\tscore\tfrequency\n"); for (KD_keyconcept k : concept_list) { output.append(r + "\t" + k.getString() + "\t" + k.getSysnonyms() + "\t" + k.score + "\t" + k.frequency + "\n"); r++; } try { BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(FilenameUtils.getFullPath(f.getAbsolutePath()) + FilenameUtils.getBaseName(f.getAbsolutePath()) + ".tsv"), "UTF-8")); out.write(output.toString()); out.close(); } catch (Exception e) { e.printStackTrace(); } } long estimatedTime = System.currentTimeMillis() - startTime; System.out.println("Finished in: " + estimatedTime + " milliseconds\n"); } } //kxc.key_concept_extraction(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy