/*******************************************************************************
* Copyright 2015 Defense Health Agency (DHA)
*
* If your use of this software does not include any GPLv2 components:
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* ----------------------------------------------------------------------------
* If your use of this software includes any GPLv2 components:
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*******************************************************************************/
package prerna.poi.main;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import java.util.Scanner;
import java.util.Vector;
import java.util.regex.Pattern;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.tika.exception.TikaException;
import org.xml.sax.SAXException;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.EnglishGrammaticalRelations;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.TreeGraphNode;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.util.CoreMap;
import prerna.algorithm.nlp.NLPSingletons;
import prerna.algorithm.nlp.NaturalLanguageProcessingHelper;
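/**
 * Extracts subject-predicate-object triples from documents and web-pages.
 * Each input is split into sentences, each sentence is dependency-parsed with
 * the Stanford parser, and subject/object grammatical relations are walked to
 * build TripleWrapper objects. Term occurrence counts are then attached and
 * predicates are lemmatized.
 */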
public class ProcessNLP {
private List<TripleWrapper> triples; // all triples accumulated by findTriples()
private LexicalizedParser lp;
private static final Logger LOGGER = LogManager.getLogger(ProcessNLP.class.getName());
public ProcessNLP(){
lp = NLPSingletons.getInstance().getLp();
}
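/**
 * Entry point: generates triples for a set of files and/or web-pages.
 * A minimal usage sketch (the file path below is hypothetical):
 * <pre>
 * ProcessNLP nlp = new ProcessNLP();
 * List&lt;TripleWrapper&gt; results = nlp.generateTriples(new String[]{"docs/report.docx"});
 * </pre>
 * @param files Array of file paths and/or URLs to process
 * @return The list of extracted triples
 * @throws IOException If text extraction fails for any file
 */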
public List<TripleWrapper> generateTriples(String[] files) throws IOException {
triples = new ArrayList<TripleWrapper>();
for(int i = 0; i < files.length; i++){
processFile(files[i]);
}
createOccurrenceCount();
lemmatize();
return triples;
}
/**
* Generates all the triples for the file or web-page
* @param file String representing the file path
* @throws IOException
*/
private void processFile(String file) throws IOException {
// Returns a list of all the sentences in a file/web-page
List<String> fileSentences = readDoc(file);
for(int i = 0; i < fileSentences.size(); i++) {
String sentence = fileSentences.get(i);
List<TypedDependency> tdl = new ArrayList<TypedDependency>();
List<TaggedWord> taggedWords = new ArrayList<TaggedWord>();
boolean sentenceParsable = NaturalLanguageProcessingHelper.createDepList(lp, sentence, tdl, taggedWords); //create dependencies
if(sentenceParsable)
{
Hashtable<GrammaticalRelation, Vector<TypedDependency>> nodeHash = new Hashtable<GrammaticalRelation, Vector<TypedDependency>>();
Hashtable<String, String> negHash = new Hashtable<String, String>();
// map each grammatical relation to the typed dependencies found in the sentence
NaturalLanguageProcessingHelper.setTypeDependencyHash(tdl, nodeHash);
generateTriples(sentence, file.substring(file.lastIndexOf(File.separator)+1), taggedWords, negHash, nodeHash);
}
}
}
/**
* Returns a list of all the sentences in a file or web-page
* @param file String representing the file path
* @return List containing the sentences in the file
* @throws IOException If text extraction from the document fails
*/
public List<String> readDoc(String file) throws IOException {
// Use a text extractor to grab all the sentences in a file or web-page
List<String> fileSentences = new ArrayList<String>();
try {
if(file.contains("http")) {
LOGGER.info("Extracting text from a web-page...");
readFile(TextExtractor.websiteTextExtractor(file), fileSentences);
}
if(file.endsWith(".doc") || file.endsWith(".docx")){
LOGGER.info("Extracting text from a word document...");
readFile(TextExtractor.fileTextExtractor(file), fileSentences);
}
if(file.endsWith(".txt"))
{
LOGGER.info("Extracting text from a text file...");
readFile(TextExtractor.fileTextExtractor(file), fileSentences);
}
} catch (IOException | SAXException | TikaException e) {
throw new IOException("Error extracting text from document", e);
}
return fileSentences;
}
/**
* Fills in a List with all the sentences in a string
* @param text The String containing sentences
* @param fileSentences The List to fill with all the sentences in the String passed in
*/
private void readFile(String text, List<String> fileSentences) {
// Split the text into sentences on terminal punctuation followed by whitespace.
// The original splitting pattern was lost to extraction; this regex is a minimal stand-in.
Pattern p = Pattern.compile("(?<=[.!?])\\s+");
Scanner scanner = new Scanner(text);
scanner.useDelimiter(p);
while(scanner.hasNext()) {
fileSentences.add(scanner.next().trim());
}
scanner.close();
}
/**
 * Runs every supported subject/object relation pattern over a parsed sentence
 * and adds the resulting triples to the triples list.
 * @param sentence The sentence being processed
 * @param documentName The name of the source document
 * @param taggedWords The part-of-speech tagged words in the sentence
 * @param negHash Hashtable of negated terms
 * @param nodeHash Hashtable from grammatical relations to typed dependencies
 */
private void generateTriples(String sentence, String documentName, List<TaggedWord> taggedWords, Hashtable<String, String> negHash, Hashtable<GrammaticalRelation, Vector<TypedDependency>> nodeHash)
{
NaturalLanguageProcessingHelper.createNegations(negHash, nodeHash);
// I ate the sandwich. -> I, ate, sandwich ("the" is included in the expanded object.)
findTriples(sentence, documentName, taggedWords, negHash, nodeHash, EnglishGrammaticalRelations.NOMINAL_SUBJECT, EnglishGrammaticalRelations.DIRECT_OBJECT);
// The man has been killed by the police. -> man, killed, police (Requires Collapsed Dependencies)
findTriples(sentence, documentName, taggedWords, negHash, nodeHash, EnglishGrammaticalRelations.AGENT, EnglishGrammaticalRelations.NOMINAL_PASSIVE_SUBJECT);
// TODO: this no longer exists in jar version 3.5, need to replace controlling_subject
findTriples(sentence, documentName, taggedWords, negHash, nodeHash, EnglishGrammaticalRelations.CONTROLLING_SUBJECT, EnglishGrammaticalRelations.DIRECT_OBJECT);
// I sat on the chair. -> I, sat, on (without our code)
findTriples(sentence, documentName, taggedWords, negHash, nodeHash, EnglishGrammaticalRelations.NOMINAL_PASSIVE_SUBJECT, EnglishGrammaticalRelations.PREPOSITIONAL_MODIFIER);
// He is tall. -> He, tall, is (without our code)
findTriples(sentence, documentName, taggedWords, negHash, nodeHash, EnglishGrammaticalRelations.NOMINAL_SUBJECT, EnglishGrammaticalRelations.COPULA);
// She looks beautiful. -> She, looks, beautiful
findTriples(sentence, documentName, taggedWords, negHash, nodeHash, EnglishGrammaticalRelations.NOMINAL_SUBJECT, EnglishGrammaticalRelations.ADJECTIVAL_COMPLEMENT);
// I will sit on the chair. -> I, sit, on (without our code)
findTriples(sentence, documentName, taggedWords, negHash, nodeHash, EnglishGrammaticalRelations.NOMINAL_SUBJECT, EnglishGrammaticalRelations.PREPOSITIONAL_MODIFIER);
}
/**
 * Finds all triples in a sentence whose subject matches subjR and whose object matches objR.
 * @param sentence The sentence being processed
 * @param documentName The name of the source document
 * @param taggedWords The part-of-speech tagged words in the sentence
 * @param negHash Hashtable of negated terms
 * @param nodeHash Hashtable from grammatical relations to typed dependencies
 * @param subjR The grammatical relation used to find the subject
 * @param objR The grammatical relation used to find the object
 */
public void findTriples(
String sentence,
String documentName,
List<TaggedWord> taggedWords,
Hashtable<String, String> negHash,
Hashtable<GrammaticalRelation, Vector<TypedDependency>> nodeHash,
GrammaticalRelation subjR,
GrammaticalRelation objR)
{
// based on the subjects and objects now find the predicates
Vector<TypedDependency> dobjV = nodeHash.get(objR);
Vector<TypedDependency> subjV = nodeHash.get(subjR);
if(dobjV != null && subjV != null)
{
for(int i = 0; i < dobjV.size(); i++)
{
TreeGraphNode obj = dobjV.get(i).dep();
TreeGraphNode pred = dobjV.get(i).gov();
String predicate = pred.value(); // Note: value doesn't return the number, while toString does
String preposition = null;
if (dobjV.get(i).toString().contains("prep")) {
obj = NaturalLanguageProcessingHelper.findPrepObject(dobjV, subjV, nodeHash, EnglishGrammaticalRelations.PREPOSITIONAL_MODIFIER, EnglishGrammaticalRelations.PREPOSITIONAL_OBJECT);
if (obj == null) {
continue;
}
preposition = dobjV.get(i).dep().toString();
}
// now find the subject
for(int j = 0; j < subjV.size(); j++)
{
TreeGraphNode subj = subjV.get(j).dep();
TreeGraphNode dep2 = subjV.get(j).gov();
// Test to make sure both words have the same governor -> i.e. they are connected in the sentence
if((dep2.toString()).equalsIgnoreCase(pred.toString()))
{
// JJ = adjective
// If the predicate was parsed as an adjective (usually occurs for adjectival complements), the adjective lands in the predicate slot while the verb lands in the object slot -> swap the two
if (subj.label().tag().contains("JJ")) {
TreeGraphNode tempNode = pred;
pred = obj;
obj = tempNode;
}
//CORE TRIPLES FOUND
TripleWrapper tripleContainer = new TripleWrapper();
tripleContainer.setObj1(formatString(subj.value(), false, true));
tripleContainer.setPred(formatString(pred.value(), false, true));
tripleContainer.setObj2(formatString(obj.value(), false, true));
//FINDING EXTENSION OF SUBJECT****
// find if complemented
// need to do this only if the subj is not a noun
// final subject
TreeGraphNode altPredicate = NaturalLanguageProcessingHelper.findCompObject(dep2, nodeHash);
if(!subj.label().tag().contains("NN") && ( nodeHash.containsKey(EnglishGrammaticalRelations.CLAUSAL_COMPLEMENT) || nodeHash.containsKey(EnglishGrammaticalRelations.XCLAUSAL_COMPLEMENT)))
{
subj = NaturalLanguageProcessingHelper.findComplementNoun(subj, dep2, nodeHash, EnglishGrammaticalRelations.CLAUSAL_COMPLEMENT);
if(!subj.label().tag().contains("NN")){
subj = NaturalLanguageProcessingHelper.findCompSubject(dep2, nodeHash);
}
}
String finalSubject = NaturalLanguageProcessingHelper.getFullNoun(subj);
String finalObject = NaturalLanguageProcessingHelper.getFullNoun(obj);
finalObject = finalObject + NaturalLanguageProcessingHelper.findPrepNounForPredicate(pred, nodeHash);
//FINDING EXTENSION OF PREDICATE****
// find the negators for the predicates next
if(negHash.containsKey(pred + "")|| negHash.containsKey(altPredicate + "")) {
predicate = "NOT " + predicate;
}
// I sat on a chair -> I -> sat on -> chair
if (preposition != null) {
predicate += preposition;
}
//EXTENSION OF OBJECT FOUND****
// fulcrum on the nsubj to see if there is an NNP in the vicinity
// if(finalObject.indexOf(predicate) < 0 && predicate.indexOf(finalObject) < 0) {
// LOGGER.info("VERB Triple: " + finalSubject + "<<>>" + predicate + "<<>>" + finalObject);
// }
tripleContainer.setObj1Expanded(formatString(finalSubject, true, false)); // part of future SetTriple
tripleContainer.setPredExpanded(formatString(predicate, true, false));
tripleContainer.setObj2Expanded(formatString(finalObject, true, false));
tripleContainer.setDocName(documentName);
tripleContainer.setSentence(sentence);
triples.add(tripleContainer);
}
}
}
}
}
/**
* Format the string before putting it in the TripleWrapper. If the input is empty or null, the return is "NA".
* @param s The String to format
* @param clean Boolean if you want to clean the String by replacing unwanted characters
* @param toLower Boolean if you want to make the String lowercase
* @return The cleaned up version of the String
*/
private String formatString(String s, boolean clean, boolean toLower) {
if(s == null || s.isEmpty()) {
return "NA";
}
String retString = s;
if(clean) {
retString = s.replace("'", ",").replace("`", ",");
}
if(toLower) {
retString = retString.toLowerCase();
}
return retString;
}
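/**
 * Counts how many times each subject, predicate, and object value appears
 * across all extracted triples and stores those counts on each TripleWrapper.
 */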
private void createOccurrenceCount() {
Hashtable<String, Integer> termCounts = new Hashtable<String, Integer>();
int numTriples = triples.size();
for(int i = 0; i < numTriples; i++){
String[] keys = new String[]{triples.get(i).getObj1(), triples.get(i).getPred(), triples.get(i).getObj2()};
for(String key : keys) {
if(termCounts.containsKey(key)) {
int val = termCounts.get(key);
termCounts.put(key, val+1);
} else {
termCounts.put(key, 1);
}
}
}
for(int i = 0; i < numTriples; i++) {
triples.get(i).setObj1Count(termCounts.get(triples.get(i).getObj1()));
triples.get(i).setPredCount(termCounts.get(triples.get(i).getPred()));
triples.get(i).setObj2Count(termCounts.get(triples.get(i).getObj2()));
}
}
//TODO: figure out how to put this in find triples
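/**
 * Replaces each triple's predicate with its lemma (e.g. "ate" -> "eat")
 * using a CoreNLP pipeline.
 */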
public void lemmatize() {
StanfordCoreNLP pipeline = new StanfordCoreNLP();
for(int i = 0; i < triples.size(); i++){
Annotation document = new Annotation(triples.get(i).getPred());
pipeline.annotate(document);
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
for(CoreMap sentence: sentences) {
for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
// LOGGER.info("lemmatized " + token.get(LemmaAnnotation.class));
// LOGGER.info("original " + triples.get(i).getPred());
triples.get(i).setPred(token.get(LemmaAnnotation.class));
}
}
}
}
}