edu.stanford.nlp.patterns.ScorePhrases Maven / Gradle / Ivy

Stanford CoreNLP provides a set of natural language analysis tools that take raw English text and give the base forms of words, their parts of speech, and whether they are names of companies, people, etc.; normalize dates, times, and numeric quantities; mark up the structure of sentences in terms of phrases and word dependencies; and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher-level text understanding applications.
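For orientation, here is a minimal, illustrative sketch of driving such a pipeline from Java. The annotator list and example text are assumptions for the example, and classes such as CoreDocument presume a reasonably recent CoreNLP release; this is not part of the source listing below.

import edu.stanford.nlp.pipeline.CoreDocument;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.util.Properties;

public class PipelineSketch {
  public static void main(String[] args) {
    // Choose annotators; tokenize/ssplit/pos/lemma/ner are standard CoreNLP annotators.
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    // Annotate a document and print each token with its lemma and NER tag.
    CoreDocument doc = new CoreDocument("Stanford University was founded in 1885.");
    pipeline.annotate(doc);
    doc.tokens().forEach(tok ->
        System.out.println(tok.word() + "\t" + tok.lemma() + "\t" + tok.ner()));
  }
}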

package edu.stanford.nlp.patterns;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import javax.json.Json;
import javax.json.JsonArray;
import javax.json.JsonArrayBuilder;
import javax.json.JsonObjectBuilder;
import javax.json.JsonReader;
import javax.json.JsonValue;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.Env;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.patterns.dep.ApplyDepPatterns;
import edu.stanford.nlp.patterns.surface.*;
import edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass.WordScoring;
import edu.stanford.nlp.patterns.PhraseScorer.Normalization;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.stats.TwoDimensionalCounter;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.ArgumentParser.Option;
import edu.stanford.nlp.util.logging.Redwood;

public class ScorePhrases<E extends Pattern> {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(ScorePhrases.class);

  Map<String, Boolean> writtenInJustification = new HashMap<>();

  ConstantsAndVariables constVars = null;

  @Option(name = "phraseScorerClass")
  Class<? extends PhraseScorer> phraseScorerClass = ScorePhrasesAverageFeatures.class;
  PhraseScorer phraseScorer = null;

  public ScorePhrases(Properties props, ConstantsAndVariables cv){
    ArgumentParser.fillOptions(this, props);
    this.constVars = cv;
    try {
      phraseScorer = phraseScorerClass
          .getConstructor(ConstantsAndVariables.class).newInstance(constVars);
    } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
      throw new RuntimeException(e);
    }
    ArgumentParser.fillOptions(phraseScorer, props);
  }

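  /**
   * Selects up to {@code constVars.numWordsToAdd} top-scoring candidate phrases from the
   * priority queue over {@code newdt}, stopping once a phrase's score drops below
   * {@code thresholdWordExtract}. Phrases supported by too few non-redundant patterns, or
   * fuzzily matching a word in {@code ignoreWords}, are skipped.
   */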
  public Counter<CandidatePhrase> chooseTopWords(Counter<CandidatePhrase> newdt,
                                                 TwoDimensionalCounter<CandidatePhrase, E> terms,
                                                 Counter<CandidatePhrase> useThresholdNumPatternsForTheseWords,
                                                 Set<CandidatePhrase> ignoreWords, double thresholdWordExtract) {

    Iterator<CandidatePhrase> termIter = Counters.toPriorityQueue(newdt).iterator();
    Counter<CandidatePhrase> finalwords = new ClassicCounter<>();

    while (termIter.hasNext()) {

      if (finalwords.size() >= constVars.numWordsToAdd){
        break;
      }
      CandidatePhrase w = termIter.next();
      if (newdt.getCount(w) < thresholdWordExtract) {
        Redwood.log(ConstantsAndVariables.extremedebug,"not adding word " + w + " and any later words because the score " + newdt.getCount(w) + " is less than the threshold of  " + thresholdWordExtract);
        break;
      }
      assert (newdt.getCount(w) != Double.POSITIVE_INFINITY);
      if (useThresholdNumPatternsForTheseWords.containsKey(w)
          && numNonRedundantPatterns(terms, w) < constVars.thresholdNumPatternsApplied) {
        Redwood
            .log(
                "extremePatDebug",
                "Not adding "
                    + w
                    + " because the number of non redundant patterns are below threshold of " +  constVars.thresholdNumPatternsApplied + ":"
                    + terms.getCounter(w).keySet());
        continue;
      }
      CandidatePhrase matchedFuzzy = null;
      if (constVars.minLen4FuzzyForPattern > 0 && ignoreWords != null)
        matchedFuzzy = ConstantsAndVariables.containsFuzzy(ignoreWords, w, constVars.minLen4FuzzyForPattern);
      if (matchedFuzzy == null) {
        Redwood.log("extremePatDebug", "adding word " + w);
        finalwords.setCount(w, newdt.getCount(w));
      } else {
        Redwood
            .log("extremePatDebug", "not adding " + w
                + " because it matched " + matchedFuzzy
                + " in common English word");
        ignoreWords.add(w);
      }
    }
    String nextTen = "";
    int n = 0;
    while (termIter.hasNext()) {
      n++;
      if (n > 10)
        break;
      CandidatePhrase w = termIter.next();
      nextTen += ";\t" + w + ":" + newdt.getCount(w);
    }
    Redwood.log(Redwood.DBG, "Next ten phrases were " + nextTen);
    return finalwords;
  }

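  /** Removes every key in {@code removeKeysCollection} from the first dimension of the counter. */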
  public static <E, F> void removeKeys(TwoDimensionalCounter<E, F> counter,
      Collection<E> removeKeysCollection) {

    for (E key : removeKeysCollection)
      counter.remove(key);
  }

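  /**
   * Counts the patterns that extracted {@code w} whose string forms are not substrings of one
   * another, as a rough measure of how many independent patterns support the phrase.
   */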
  private double numNonRedundantPatterns(
      TwoDimensionalCounter<CandidatePhrase, E> terms, CandidatePhrase w) {
    Object[] pats = terms.getCounter(w).keySet()
        .toArray();
    int numPat = 0;
    for (int i = 0; i < pats.length; i++) {
      //String pati = constVars.getPatternIndex().get(pats[i]).toString();
      String pati = pats[i].toString();
      boolean contains = false;
      for (int j = i + 1; j < pats.length; j++) {
        //String patj = constVars.getPatternIndex().get(pats[j]).toString();
        String patj = pats[j].toString();
        if (patj.contains(pati) || pati.contains(patj)) {
          contains = true;
          break;
        }
      }
      if (!contains)
        numPat++;
    }
    return numPat;
  }

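  /**
   * One iteration of phrase learning: applies the patterns learned this iteration to the data
   * and returns the newly selected phrases for {@code label}, excluding words already learned
   * or present in the seed dictionary.
   */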
  public Counter<CandidatePhrase> learnNewPhrases(
    String label,
    PatternsForEachToken<E> patternsForEachToken,
    Counter<E> patternsLearnedThisIter,
    Counter<E> allSelectedPatterns,
    CollectionValuedMap<E, Triple<String, Integer, Integer>> tokensMatchedPatterns,
    Counter<CandidatePhrase> scoreForAllWordsThisIteration,
    TwoDimensionalCounter<CandidatePhrase, E> terms,
    TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted,
    TwoDimensionalCounter<E, CandidatePhrase> patternsAndWords4Label,
    String identifier, Set<CandidatePhrase> ignoreWords) throws IOException, ClassNotFoundException {

    boolean computeProcDataFreq = false;
    if (Data.processedDataFreq == null) {
      computeProcDataFreq = true;
      Data.processedDataFreq = new ClassicCounter<>();
      assert Data.rawFreq != null;
    }

    Set<CandidatePhrase> alreadyIdentifiedWords = new HashSet<>(constVars.getLearnedWords(label).keySet());
    alreadyIdentifiedWords.addAll(constVars.getSeedLabelDictionary().get(label));
    Counter<CandidatePhrase> words = learnNewPhrasesPrivate(label,
        patternsForEachToken, patternsLearnedThisIter, allSelectedPatterns, alreadyIdentifiedWords,
        tokensMatchedPatterns, scoreForAllWordsThisIteration, terms,
        wordsPatExtracted,  patternsAndWords4Label,
        identifier, ignoreWords, computeProcDataFreq);

    //constVars.addLabelDictionary(label, words.keySet());


    return words;
  }

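  /**
   * Compiles the given pattern (as a TokenSequencePattern for surface patterns or a
   * SemgrexPattern for dependency patterns) and applies it to the supplied sentences,
   * splitting the work across worker threads (a single chunk when there are fewer than 50
   * sentences) and accumulating the extracted candidate phrases and matched token spans.
   */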
  void runParallelApplyPats(Map<String, DataInstance> sents, String label, E pattern, TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted,
                            CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat, Set<CandidatePhrase> alreadyLabeledWords) {

    Redwood.log(Redwood.DBG, "Applying pattern " + pattern + " to a total of " + sents.size() + " sentences ");
    List<String> notAllowedClasses = new ArrayList<>();
    List<String> sentids = CollectionUtils.toList(sents.keySet());
    if(constVars.doNotExtractPhraseAnyWordLabeledOtherClass){
      for(String l: constVars.getAnswerClass().keySet()){
        if(!l.equals(label)){
          notAllowedClasses.add(l);
        }
      }
      notAllowedClasses.add("OTHERSEM");
    }


    Map<TokenSequencePattern, E> surfacePatternsLearnedThisIterConverted = null;
    Map<SemgrexPattern, E> depPatternsLearnedThisIterConverted = null;

    if(constVars.patternType.equals(PatternFactory.PatternType.SURFACE)) {
      surfacePatternsLearnedThisIterConverted = new HashMap<>();
      String patternStr = null;
      try{
        patternStr = pattern.toString(notAllowedClasses);
        TokenSequencePattern pat = TokenSequencePattern.compile(constVars.env.get(label), patternStr);
        surfacePatternsLearnedThisIterConverted.put(pat, pattern);
      }catch(Exception e){
        log.info("Error applying patterrn " + patternStr + ". Probably an ill formed pattern (can be because of special symbols in label names). Contact the software developer.");
        throw e;
      }
    }else if(constVars.patternType.equals(PatternFactory.PatternType.DEP)){
      depPatternsLearnedThisIterConverted = new HashMap<>();
      SemgrexPattern pat = SemgrexPattern.compile(pattern.toString(notAllowedClasses), new edu.stanford.nlp.semgraph.semgrex.Env(constVars.env.get(label).getVariables()));
      depPatternsLearnedThisIterConverted.put(pat, pattern);
    } else
      throw new UnsupportedOperationException();

    //Apply the patterns and extract candidate phrases
    int num;
    int numThreads = constVars.numThreads;

    //If number of sentences is less, do not create so many threads
    if(sents.size() < 50)
      numThreads = 1;

    if (numThreads == 1)
      num = sents.size();
    else
      num = sents.size() / (numThreads - 1);

    ExecutorService executor = Executors.newFixedThreadPool(constVars.numThreads);
    List<Future<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>>> list = new ArrayList<>();


    for (int i = 0; i < numThreads; i++) {

      Callable<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>> task = null;

      if(pattern.type.equals(PatternFactory.PatternType.SURFACE))
        //Redwood.log(Redwood.DBG, "Applying pats: assigning sentences " + i*num + " to " +Math.min(sentids.size(), (i + 1) * num) + " to thread " + (i+1));
        task = new ApplyPatterns(sents, num == sents.size() ? sentids : sentids.subList(i * num,
          Math.min(sentids.size(), (i + 1) * num)), surfacePatternsLearnedThisIterConverted, label,
          constVars.removeStopWordsFromSelectedPhrases,
          constVars.removePhrasesWithStopWords, constVars);
      else
        task = new ApplyDepPatterns(sents, num == sents.size() ? sentids : sentids.subList(i * num,
          Math.min(sentids.size(), (i + 1) * num)), depPatternsLearnedThisIterConverted, label,
          constVars.removeStopWordsFromSelectedPhrases,
          constVars.removePhrasesWithStopWords, constVars);


      Future<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>> submit = executor
          .submit(task);
      list.add(submit);
    }

    // Now retrieve the result
    for (Future<Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>>> future : list) {
      try{
        Triple<TwoDimensionalCounter<CandidatePhrase, E>, CollectionValuedMap<E, Triple<String, Integer, Integer>>, Set<CandidatePhrase>> result = future
            .get();
        Redwood.log(ConstantsAndVariables.extremedebug, "Pattern " + pattern + " extracted phrases " + result.first());
        wordsandLemmaPatExtracted.addAll(result.first());
        matchedTokensByPat.addAll(result.second());
        alreadyLabeledWords.addAll(result.third());
      }catch(Exception e){
        executor.shutdownNow();
        throw new RuntimeException(e);
      }
    }
    executor.shutdown();
  }
/*
  void runParallelApplyPats(Map> sents, Set sentIds, String label, Counter patternsLearnedThisIter,  TwoDimensionalCounter, Integer> wordsandLemmaPatExtracted,
                            CollectionValuedMap> matchedTokensByPat) throws InterruptedException, ExecutionException{
    List keyset = new ArrayList(sentIds);
    List notAllowedClasses = new ArrayList();

    if(constVars.doNotExtractPhraseAnyWordLabeledOtherClass){
      for(String l: constVars.getAnswerClass().keySet()){
        if(!l.equals(label)){
          notAllowedClasses.add(l+":"+l);
        }
      }
      notAllowedClasses.add("OTHERSEM:OTHERSEM");
    }

    //Apply the patterns and extract candidate phrases
    int num = 0;
    if (constVars.numThreads == 1)
      num = keyset.size();
    else
      num = keyset.size() / (constVars.numThreads - 1);
    ExecutorService executor = Executors.newFixedThreadPool(constVars.numThreads);
    List, Integer>, CollectionValuedMap>>>> list = new ArrayList, Integer>, CollectionValuedMap>>>>();
    for (int i = 0; i < constVars.numThreads; i++) {

      Callable, Integer>, CollectionValuedMap>>> task = null;
      Map patternsLearnedThisIterConverted = new HashMap();
      for(Integer pindex : patternsLearnedThisIter.keySet()){
        SurfacePattern p = constVars.getPatternIndex().get(pindex);
        TokenSequencePattern pat = TokenSequencePattern.compile(constVars.env.get(label), p.toString(notAllowedClasses));
        patternsLearnedThisIterConverted.put(pat, pindex);
      }

      task = new ApplyPatternsMulti(sents, keyset.subList(i * num,
        Math.min(keyset.size(), (i + 1) * num)), patternsLearnedThisIterConverted, label,
        constVars.removeStopWordsFromSelectedPhrases,
        constVars.removePhrasesWithStopWords, constVars);

      Future, Integer>, CollectionValuedMap>>> submit = executor
        .submit(task);
      list.add(submit);
    }

    // Now retrieve the result
    for (Future, Integer>, CollectionValuedMap>>> future : list) {
      try{
        Pair, Integer>, CollectionValuedMap>> result = future
          .get();
        wordsandLemmaPatExtracted.addAll(result.first());
        matchedTokensByPat.addAll(result.second());
      }catch(Exception e){
        executor.shutdownNow();
        throw new RuntimeException(e);
      }
    }
    executor.shutdown();
  }
*/

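  /**
   * Maps each pattern to the sentences (by id) it should be applied to, loading sentences from
   * serialized batch files when {@code constVars.batchProcessSents} is set and from
   * {@code Data.sents} otherwise.
   */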
  protected Map<E, Map<String, DataInstance>> getSentences(Map<E, Set<String>> sentids) {
    try{

      Set<File> files = new HashSet<>();

      Map<E, Map<String, DataInstance>> sentsAll  = new HashMap<>();
      CollectionValuedMap<String, E> sentIds2Pats = new CollectionValuedMap<>();
      for(Map.Entry<E, Set<String>> setEn: sentids.entrySet()){
        if(!sentsAll.containsKey(setEn.getKey()))
          sentsAll.put(setEn.getKey(), new HashMap<>());
        for(String s: setEn.getValue()){
          sentIds2Pats.add(s, setEn.getKey());
          if(constVars.batchProcessSents){
            File f = Data.sentId2File.get(s);
            assert f!= null : "How come no file for sentence " + s;
            files.add(f);
          }
        }
      }

      if(constVars.batchProcessSents){
        for(File f: files){
          Map<String, DataInstance> sentsf = IOUtils.readObjectFromFile(f);
          for(Map.Entry<String, DataInstance> s: sentsf.entrySet()){
            for(E pat: sentIds2Pats.get(s.getKey()))
              sentsAll.get(pat).put(s.getKey(), s.getValue());
          }
        }
      }else{
        for(Map.Entry<String, DataInstance> s: Data.sents.entrySet()){
          for(E pat: sentIds2Pats.get(s.getKey()))
            sentsAll.get(pat).put(s.getKey(), s.getValue());
        }
      }

//      /System.out.println("All sentences are " + sentsAll.entrySet().stream().map( x -> constVars.patternIndex.get(x.getKey())+":"+x.getValue()).collect(Collectors.toList()));
      return sentsAll;
    }catch(ClassNotFoundException e){
      throw new RuntimeException(e);
    }catch(IOException e1){
      throw new RuntimeException(e1);

    }
  }


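  /**
   * Merges the global TokensRegex environment into each label's environment, then applies every
   * learned pattern to the sentences returned by the inverted-index query, collecting the
   * (phrase, pattern) extraction counts and the token spans each pattern matched.
   */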
  public void applyPats(Counter<E> patterns, String label, TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted,
                        CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat, Set<CandidatePhrase> alreadyLabeledWords){
 //   Counter patternsLearnedThisIterConsistsOnlyGeneralized = new ClassicCounter();
 //   Counter patternsLearnedThisIterRest = new ClassicCounter();
//    Set specialWords = constVars.invertedIndex.getSpecialWordsList();

    for(Map.Entry<String, Env> en: constVars.env.entrySet()){
      en.getValue().getVariables().putAll(ConstantsAndVariables.globalEnv.getVariables());
    }

    Map<E, Map<String, DataInstance>> sentencesForPatterns = getSentences(constVars.invertedIndex.queryIndex(patterns.keySet()));

    for(Map.Entry<E, Map<String, DataInstance>> en: sentencesForPatterns.entrySet()){
      runParallelApplyPats(en.getValue(), label, en.getKey(), wordsandLemmaPatExtracted, matchedTokensByPat, alreadyLabeledWords);
    }

    Redwood.log(Redwood.DBG, "# words/lemma and pattern pairs are " + wordsandLemmaPatExtracted.size());
  }
  /*
  public void applyPats(Counter patterns, String label, boolean computeDataFreq,  TwoDimensionalCounter, Integer> wordsandLemmaPatExtracted,
                        CollectionValuedMap> matchedTokensByPat) throws ClassNotFoundException, IOException, InterruptedException, ExecutionException{
    Counter patternsLearnedThisIterConsistsOnlyGeneralized = new ClassicCounter();
    Counter patternsLearnedThisIterRest = new ClassicCounter();
    Set specialWords = constVars.invertedIndex.getSpecialWordsList();
    List extremelySmallStopWordsList = Arrays.asList(".",",","in","on","of","a","the","an");

    for(Entry en: patterns.entrySet()){
      Integer pindex = en.getKey();
      SurfacePattern p = constVars.getPatternIndex().get(pindex);
      String[] n = p.getSimplerTokensNext();
      String[] pr = p.getSimplerTokensPrev();
      boolean rest = false;
      if(n!=null){
        for(String e: n){
          if(!specialWords.contains(e)){
            rest = true;
            break;
          }
        }
      }
      if(rest == false && pr!=null){
        for(String e: pr){
          if(!specialWords.contains(e) && !extremelySmallStopWordsList.contains(e)){
            rest = true;
            break;
          }
        }
      }
      if(rest)
        patternsLearnedThisIterRest.setCount(en.getKey(), en.getValue());
      else
        patternsLearnedThisIterConsistsOnlyGeneralized.setCount(en.getKey(), en.getValue());
    }



    Map> sentidswithfilerest = constVars.invertedIndex.getFileSentIdsFromPats(patternsLearnedThisIterRest.keySet(), constVars.getPatternIndex());

    if (constVars.batchProcessSents) {
      List filesToLoad;
      if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0)
        filesToLoad = Data.sentsFiles;
      else{
        filesToLoad = new ArrayList();
        for (String fname : sentidswithfilerest.keySet()) {
          String filename;
//          if(!constVars.usingDirForSentsInIndex)
//            filename = constVars.saveSentencesSerDir+"/"+fname;
//          else
            filename = fname;
          filesToLoad.add(new File(filename));
        }
      }

      for (File fname : filesToLoad) {
        Redwood.log(Redwood.DBG, "Applying patterns to sents from " + fname);
        Map> sents = IOUtils.readObjectFromFile(fname);

        if(sentidswithfilerest != null && !sentidswithfilerest.isEmpty()){

          String filename;
//          if(constVars.usingDirForSentsInIndex)
//            filename = constVars.saveSentencesSerDir+"/"+fname.getName();
//          else
            filename = fname.getAbsolutePath();

          Set sentIDs = sentidswithfilerest.get(filename);
          if (sentIDs != null){
            this.runParallelApplyPats(sents, sentIDs, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat);
          } else
            Redwood.log(Redwood.DBG, "No sentIds for " + filename  + " in the index for the keywords from the patterns! The index came up with these files: " + sentidswithfilerest.keySet());
        }
        if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){
          this.runParallelApplyPats(sents, sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat);
        }

        if (computeDataFreq){
          Data.computeRawFreqIfNull(sents, constVars.numWordsCompound);
          Data.fileNamesUsedToComputeRawFreq.add(fname.getName());
        }
      }

      //Compute Frequency from the files not loaded using the invertedindex query. otherwise, later on there is an error.
      if(computeDataFreq){
        for(File f: Data.sentsFiles){
          if(!Data.fileNamesUsedToComputeRawFreq.contains(f.getName())){
            Map> sents = IOUtils.readObjectFromFile(f);
            Data.computeRawFreqIfNull(sents, constVars.numWordsCompound);
            Data.fileNamesUsedToComputeRawFreq.add(f.getName());
          }
        }
      }

    } else {

      if (sentidswithfilerest != null && !sentidswithfilerest.isEmpty()) {
        String filename = CollectionUtils.toList(sentidswithfilerest.keySet()).get(0);
        Set sentids = sentidswithfilerest.get(filename);
        if (sentids != null) {
          this.runParallelApplyPats(Data.sents, sentids, label, patternsLearnedThisIterRest, wordsandLemmaPatExtracted, matchedTokensByPat);
        } else
          throw new RuntimeException("How come no sentIds for " + filename  + ". Index keyset is " + constVars.invertedIndex.getKeySet());
      }
      if(patternsLearnedThisIterConsistsOnlyGeneralized.size() > 0){
        this.runParallelApplyPats(Data.sents, Data.sents.keySet(), label, patternsLearnedThisIterConsistsOnlyGeneralized, wordsandLemmaPatExtracted, matchedTokensByPat);
      }
      Data.computeRawFreqIfNull(Data.sents, constVars.numWordsCompound);
    }
    Redwood.log(Redwood.DBG, "# words/lemma and pattern pairs are " + wordsandLemmaPatExtracted.size());
  }
  */

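  /**
   * A cheaper, lossy alternative to actually applying the patterns: counts a (word, pattern)
   * pair whenever one of this iteration's learned patterns is already recorded for a token in
   * {@code patternsForEachToken}.
   */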
  private void statsWithoutApplyingPatterns(Map<String, DataInstance> sents, PatternsForEachToken<E> patternsForEachToken,
      Counter<E> patternsLearnedThisIter, TwoDimensionalCounter<CandidatePhrase, E> wordsandLemmaPatExtracted){
    for (Entry<String, DataInstance> sentEn : sents.entrySet()) {
      Map<Integer, Set<E>> pat4Sent = patternsForEachToken.getPatternsForAllTokens(sentEn.getKey());

      if (pat4Sent == null) {
        throw new RuntimeException("How come there are no patterns for "
            + sentEn.getKey());
      }
      for (Entry<Integer, Set<E>> en : pat4Sent
          .entrySet()) {
        CoreLabel token = null;
        Set<E> p1 = en.getValue();

//        Set p1 = en.getValue().first();
//        Set p2 = en.getValue().second();
//        Set p3 = en.getValue().third();
        for (E index : patternsLearnedThisIter.keySet()) {

          if (p1.contains(index)) {
            if (token == null)
              token = sentEn.getValue().getTokens().get(en.getKey());
            wordsandLemmaPatExtracted.incrementCount(CandidatePhrase.createOrGet(token.word(), token.lemma()), index);
          }
        }
      }
    }
  }

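  /**
   * Core of phrase learning: extracts candidate phrases with the patterns learned this iteration
   * (or gathers lossy counts when {@code doNotApplyPatterns} is set), normalizes raw term
   * frequencies if requested, scores the candidates, and picks the top words either by
   * weighted-norm scoring or, for BPB, the single phrase with the best pattern weight. When an
   * output directory is configured, a JSON justification file is written per label.
   */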
  private Counter<CandidatePhrase> learnNewPhrasesPrivate(
    String label,
    PatternsForEachToken<E> patternsForEachToken,
    Counter<E> patternsLearnedThisIter,
    Counter<E> allSelectedPatterns,
    Set<CandidatePhrase> alreadyIdentifiedWords, CollectionValuedMap<E, Triple<String, Integer, Integer>> matchedTokensByPat,
    Counter<CandidatePhrase> scoreForAllWordsThisIteration,
    TwoDimensionalCounter<CandidatePhrase, E> terms,
    TwoDimensionalCounter<CandidatePhrase, E> wordsPatExtracted,
    TwoDimensionalCounter<E, CandidatePhrase> patternsAndWords4Label,
    String identifier, Set<CandidatePhrase> ignoreWords, boolean computeProcDataFreq) throws IOException, ClassNotFoundException {

    Set<CandidatePhrase> alreadyLabeledWords = new HashSet<>();
    if (constVars.doNotApplyPatterns) {
      // if want to get the stats by the lossy way of just counting without
      // applying the patterns
      ConstantsAndVariables.DataSentsIterator sentsIter = new ConstantsAndVariables.DataSentsIterator(constVars.batchProcessSents);
      while(sentsIter.hasNext()) {
        Pair<Map<String, DataInstance>, File> sentsf = sentsIter.next();
        this.statsWithoutApplyingPatterns(sentsf.first(), patternsForEachToken, patternsLearnedThisIter, wordsPatExtracted);
      }
    } else {
      if (patternsLearnedThisIter.size() > 0) {
        this.applyPats(patternsLearnedThisIter, label, wordsPatExtracted, matchedTokensByPat, alreadyLabeledWords);

      }
    }
    if(computeProcDataFreq){
      if (!phraseScorer.wordFreqNorm.equals(Normalization.NONE)) {
        Redwood.log(Redwood.DBG, "computing processed freq");
        for (Entry<CandidatePhrase, Double> fq : Data.rawFreq.entrySet()) {
          Double in = fq.getValue();
          if (phraseScorer.wordFreqNorm.equals(Normalization.SQRT))
            in = Math.sqrt(in);

          else if (phraseScorer.wordFreqNorm.equals(Normalization.LOG))
            in = 1 + Math.log(in);
          else
            throw new RuntimeException("can't understand the normalization");

          assert !in.isNaN(): "Why is processed freq nan when rawfreq is " + in;

          Data.processedDataFreq.setCount(fq.getKey(), in);
        }
      } else
        Data.processedDataFreq = Data.rawFreq;
    }

    if (constVars.wordScoring.equals(WordScoring.WEIGHTEDNORM)) {

      for (CandidatePhrase en : wordsPatExtracted.firstKeySet()) {

        if (!constVars.getOtherSemanticClassesWords().contains(en) && (en.getPhraseLemma() ==null || !constVars.getOtherSemanticClassesWords().contains(CandidatePhrase.createOrGet(en.getPhraseLemma()))) && !alreadyLabeledWords.contains(en)){
          terms.addAll(en, wordsPatExtracted.getCounter(en));
        }
      }
      removeKeys(terms, constVars.getStopWords());

      Counter<CandidatePhrase> phraseScores = phraseScorer.scorePhrases(label,
          terms, wordsPatExtracted, allSelectedPatterns,
          alreadyIdentifiedWords, false);
      System.out.println("count for word U.S. is " + phraseScores.getCount(CandidatePhrase.createOrGet("U.S.")));
      Set<CandidatePhrase> ignoreWordsAll;
      if(ignoreWords !=null && !ignoreWords.isEmpty()){
        ignoreWordsAll = CollectionUtils.unionAsSet(ignoreWords, constVars.getOtherSemanticClassesWords());
      }
      else
        ignoreWordsAll = new HashSet<>(constVars.getOtherSemanticClassesWords());

      ignoreWordsAll.addAll(constVars.getSeedLabelDictionary().get(label));
      ignoreWordsAll.addAll(constVars.getLearnedWords(label).keySet());
      System.out.println("ignoreWordsAll contains word U.S. is " + ignoreWordsAll.contains(CandidatePhrase.createOrGet("U.S.")));

      Counter<CandidatePhrase> finalwords = chooseTopWords(phraseScores, terms,
          phraseScores, ignoreWordsAll, constVars.thresholdWordExtract);

      phraseScorer.printReasonForChoosing(finalwords);

      scoreForAllWordsThisIteration.clear();
      Counters.addInPlace(scoreForAllWordsThisIteration, phraseScores);

      Redwood.log(
          ConstantsAndVariables.minimaldebug,
          "\n\n## Selected Words for " + label + " : "
              + Counters.toSortedString(finalwords, finalwords.size(),
                  "%1$s:%2$.2f", "\t"));

      if(constVars.goldEntities != null){
        Map<String, String> goldEntities4Label = constVars.goldEntities.get(label);
        if(goldEntities4Label != null) {
          StringBuffer s = new StringBuffer();
          finalwords.keySet().stream().forEach(x ->
            s.append(x.getPhrase() + (goldEntities4Label.containsKey(x.getPhrase()) ? ":"+goldEntities4Label.get(x.getPhrase()) : ":UNKNOWN")+"\n"));

          Redwood.log(ConstantsAndVariables.minimaldebug,
            "\n\n## Gold labels for selected words for label " + label + " : " + s.toString());
        } else
          Redwood.log(Redwood.DBG, "No gold entities provided for label " + label);
      }

      if (constVars.outDir != null && !constVars.outDir.isEmpty()) {
        String outputdir = constVars.outDir + "/" + identifier +"/"+ label;
        IOUtils.ensureDir(new File(outputdir));
        TwoDimensionalCounter<CandidatePhrase, CandidatePhrase> reasonForWords = new TwoDimensionalCounter<>();
        for (CandidatePhrase word : finalwords.keySet()) {
          for (E l : wordsPatExtracted.getCounter(word).keySet()) {
            for (CandidatePhrase w2 : patternsAndWords4Label.getCounter(l).keySet()) {
              reasonForWords.incrementCount(word, w2);
            }
          }
        }
        Redwood.log(ConstantsAndVariables.minimaldebug, "Saving output in " + outputdir);
        String filename = outputdir + "/words.json";

        // the json object is an array corresponding to each iteration - of list
        // of objects,
        // each of which is a bean of entity and reasons

        JsonArrayBuilder obj = Json.createArrayBuilder();
        if (writtenInJustification.containsKey(label)
            && writtenInJustification.get(label)) {
          JsonReader jsonReader = Json.createReader(new BufferedInputStream(
              new FileInputStream(filename)));
          JsonArray objarr = jsonReader.readArray();
          for (JsonValue o : objarr)
            obj.add(o);
          jsonReader.close();

        }
        JsonArrayBuilder objThisIter = Json.createArrayBuilder();

        for (CandidatePhrase w : reasonForWords.firstKeySet()) {
          JsonObjectBuilder objinner = Json.createObjectBuilder();

          JsonArrayBuilder l = Json.createArrayBuilder();
          for (CandidatePhrase w2 : reasonForWords.getCounter(w).keySet()) {
            l.add(w2.getPhrase());
          }
          JsonArrayBuilder pats = Json.createArrayBuilder();
          for (E p : wordsPatExtracted.getCounter(w).keySet()) {
            pats.add(p.toStringSimple());
          }
          objinner.add("reasonwords", l);
          objinner.add("patterns", pats);
          objinner.add("score", finalwords.getCount(w));
          objinner.add("entity", w.getPhrase());
          objThisIter.add(objinner.build());
        }
        obj.add(objThisIter);

        // Redwood.log(ConstantsAndVariables.minimaldebug, channelNameLogger,
        // "Writing justification at " + filename);
        IOUtils.writeStringToFile(StringUtils.normalize(StringUtils.toAscii(obj.build().toString())), filename, "ASCII");
        writtenInJustification.put(label, true);
      }
      if (constVars.justify) {
        Redwood.log(Redwood.DBG, "\nJustification for phrases:\n");
        for (CandidatePhrase word : finalwords.keySet()) {
          Redwood.log(
              Redwood.DBG, "Phrase " +
              word
                  + " extracted because of patterns: \t"
                  + Counters.toSortedString(wordsPatExtracted.getCounter(word),
                      wordsPatExtracted.getCounter(word).size(), "%1$s:%2$f",
                      "\n"));
        }
      }
      // if (usePatternResultAsLabel)
      // if (answerLabel != null)
      // labelWords(sents, commonEngWords, finalwords.keySet(),
      // patterns.keySet(), outFile);
      // else
      // throw new RuntimeException("why is the answer label null?");

      return finalwords;
    } else if (constVars.wordScoring.equals(WordScoring.BPB)) {
      Counters.addInPlace(terms, wordsPatExtracted);
      Counter<CandidatePhrase> maxPatWeightTerms = new ClassicCounter<>();
      Map<CandidatePhrase, E> wordMaxPat = new HashMap<>();
      for (Entry<CandidatePhrase, ClassicCounter<E>> en : terms.entrySet()) {
        Counter<E> weights = new ClassicCounter<>();
        for (E k : en.getValue().keySet())
          weights.setCount(k, patternsLearnedThisIter.getCount(k));
        maxPatWeightTerms.setCount(en.getKey(), Counters.max(weights));
        wordMaxPat.put(en.getKey(), Counters.argmax(weights));
      }
      Counters.removeKeys(maxPatWeightTerms, alreadyIdentifiedWords);
      double maxvalue = Counters.max(maxPatWeightTerms);
      Set<CandidatePhrase> words = Counters.keysAbove(maxPatWeightTerms,
          maxvalue - 1e-10);
      CandidatePhrase bestw = null;
      if (words.size() > 1) {
        double max = Double.NEGATIVE_INFINITY;
        for (CandidatePhrase w : words) {
          if (terms.getCount(w, wordMaxPat.get(w)) > max) {
            max = terms.getCount(w, wordMaxPat.get(w));
            bestw = w;
          }
        }
      } else if (words.size() == 1)
        bestw = words.iterator().next();
      else
        return new ClassicCounter<>();

      Redwood.log(ConstantsAndVariables.minimaldebug, "Selected Words: " + bestw);

      return Counters.asCounter(Arrays.asList(bestw));
    }

    else
      throw new RuntimeException("wordscoring " + constVars.wordScoring
          + " not identified");
  }

  // private void combineExternalFeatures(Counter words) {
  //
  // for (Entry en : words.entrySet()) {
  // Integer num = constVars.distSimClusters.get(en.getKey());
  // if (num == null)
  // num = -1;
  // // Double score = externalWeights.getCount(num);
  // // if not present in the clusters, take minimum of the scores of the
  // // individual words
  // // if (num == null) {
  // // for (String w : en.getKey().split("\\s+")) {
  // // Integer n = constVars.distSimClusters.get(w);
  // // if (n == null)
  // // continue;
  // // score = Math.min(score, externalWeights.getCount(n));
  // // }
  // // }
  // words.setCount(en.getKey(), en.getValue() *
  // constVars.distSimWeights.getCount(num));
  // }
  // }

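  /** Returns the scores the underlying phrase scorer has learned so far. */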
  Counter<CandidatePhrase> getLearnedScores() {
    return phraseScorer.getLearnedScores();
  }

  // private Counter getLookAheadWeights(Counter words,
  // Counter externalWordWeights, Set alreadyIdentifiedWords,
  // String label,
  // Counter currentAllPatternWeights,
  // TwoDimensionalCounter allPatternsandWords) throws
  // IOException {
  // System.out.println("size of patterns weight counter is " +
  // currentAllPatternWeights.size());
  //
  // DirectedWeightedMultigraph graph = new
  // DirectedWeightedMultigraph(org.jgrapht.graph.DefaultWeightedEdge.class);
  //
  // if (Data.googleNGram.size() == 0) {
  // Data.loadGoogleNGrams();
  // }
  //
  // TwoDimensionalCounter allPatsAndWords =
  // TwoDimensionalCounter.reverseIndexOrder(allPatternsandWords);
  // System.out.println("We have patterns for " + allPatsAndWords.size() +
  // " words ");
  // TwoDimensionalCounter lookaheadweights = new
  // TwoDimensionalCounter();
  // // Counter weights = new ClassicCounter();
  //
  // for (Entry en : words.entrySet()) {
  // Counter pats = new
  // ClassicCounter(allPatsAndWords.getCounter(en.getKey()));
  // for (SurfacePattern p : pats.keySet()) {
  // pats.setCount(p, pats.getCount(p) * currentAllPatternWeights.getCount(p));
  // }
  //
  // for (Pair p : Counters.topKeysWithCounts(pats, 10))
  // {
  //
  // for (Entry pen :
  // allPatternsandWords.getCounter(p.first()).entrySet()) {
  // if (pen.getKey().equals(en.getKey()) ||
  // alreadyIdentifiedWords.contains(pen.getKey()) ||
  // constVars.otherSemanticClasses.contains(pen.getKey()))
  // continue;
  //
  // double ngramWt = 1.0;
  // if (Data.googleNGram.containsKey(pen.getKey())) {
  // assert (Data.rawFreq.containsKey(pen.getKey()));
  // ngramWt = (1 + Data.rawFreq.getCount(pen.getKey())) / (Data.rawFreq.size()
  // + Data.googleNGram.getCount(pen.getKey()));
  // }
  // double wordweight = ngramWt;// (minExternalWordWeight +
  // // externalWordWeights.getCount(pen.getKey()))
  // // * p.second() * (0.1 +
  // // currentAllPatternWeights.getCount(p.first()))
  // // * ;
  // // if (wordweight != 0)
  // if (wordweight == 0) {
  // // System.out.println("word weight is zero for " + pen.getKey() +
  // // " and the weights were " +
  // // externalWordWeights.getCount(pen.getKey()) + ";" + p.second() +
  // // ";"
  // // + (0.1 + currentPatternWeights.getCount(p.first())) + ";" +
  // // ngramWt);
  // } else {
  // lookaheadweights.setCount(en.getKey(), pen.getKey(), Math.log(wordweight));
  // graph.addVertex(en.getKey());
  // graph.addVertex(pen.getKey());
  // DefaultWeightedEdge e = graph.addEdge(en.getKey(), pen.getKey());
  // graph.setEdgeWeight(e, lookaheadweights.getCount(en.getKey(),
  // pen.getKey()));
  // }
  //
  // }
  //
  // }
  // // weights.setCount(en.getKey(),
  // // Math.exp(Counters(lookaheadweights.getCounter(en.getKey()))));
  //
  // }
  // Counter weights = new ClassicCounter();
  // for (Entry> en :
  // lookaheadweights.entrySet()) {
  // // List> sorted =
  // // Counters.toSortedListWithCounts(en.getValue());
  // // double val = sorted.get((int) Math.floor(sorted.size() / 2)).second();
  // double wt = Math.exp(en.getValue().totalCount() / en.getValue().size());
  //
  // weights.setCount(en.getKey(), wt);
  // }
  // // Counters.expInPlace(weights);
  // // List tk = Counters.topKeys(weights, 50);
  // // BufferedWriter w = new BufferedWriter(new FileWriter("lookahead_" +
  // // answerLabel, true));
  // // for (String s : tk) {
  // // w.write(s + "\t" + weights.getCount(s) + "\t" +
  // // lookaheadweights.getCounter(s) + "\n");
  // // }
  // // w.close();
  // // BufferedWriter writer = new BufferedWriter(new FileWriter("graph.gdf"));
  // // writeGraph(writer, graph);
  // System.out.println("done writing graph");
  // Redwood.log(ConstantsAndVariables.minimaldebug, "calculated look ahead weights for " +
  // weights.size() + " words");
  //
  // return weights;
  // }

  // void writeGraph(BufferedWriter w, DirectedWeightedMultigraph g) throws IOException {
  // w.write("nodedef>name VARCHAR\n");
  // for (String n : g.vertexSet()) {
  // w.write(n + "\n");
  // }
  // w.write("edgedef>node1 VARCHAR,node2 VARCHAR, weight DOUBLE\n");
  // for (DefaultWeightedEdge e : g.edgeSet()) {
  // w.write(g.getEdgeSource(e) + "," + g.getEdgeTarget(e) + "," +
  // g.getEdgeWeight(e) + "\n");
  // }
  // w.close();
  // }

}



