package semRewrite.datesandnumber;
/*
Copyright 2014-2015 IPsoft
Author: Nagaraj Bhat [email protected]
Rashmi Rao
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA 02111-1307 USA
*/
import com.articulate.sigma.WSD;
import com.articulate.sigma.WordNet;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class DateAndNumbersGeneration {
static enum DateComponent {
DAY, MONTH, YEAR
}
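// The three time patterns below match SUTime-style normalized values such as "T09:30",
// "T09:30:15", and "2015-03-21T09:30"; POS_TAG_REMOVER captures the trailing "/TAG"
// of an IndexedWord's toString() form, as used in measureFn() below.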
static final Pattern HOUR_MINUTE_PATTERN = Pattern.compile("^T([0-9]{2}):([0-9]{2})$");
static final Pattern HOUR_MINUTE_SECOND_PATTERN = Pattern.compile("^T([0-9]{2}):([0-9]{2}):([0-9]{2})$");
static final Pattern YEAR_MONTH_TIME_PATTERN = Pattern.compile("^([0-9X]{4})(\\-[0-9]{2})?(\\-[0-9]{2})?T([0-9]{2}):([0-9]{2})(:[0-9]{2})?");
static final Pattern POS_TAG_REMOVER = Pattern.compile("(\\/[A-Z]+)$");
List<String> measureTerms = new ArrayList<>();
/** ***************************************************************
*/
public DateAndNumbersGeneration() {
}
/** ***************************************************************
* Adds the time terms to the sumoTerms list.
* @param timesList the TimeInfo objects extracted from TIME tokens
* @param utilities shared state holding the SUMO term list and time counter
*/
private void generateSumoTimeTerms(List<TimeInfo> timesList, Utilities utilities) {
for (TimeInfo times : timesList) {
if ((times.getSecond() != null) || (times.getMinute() != null) || (times.getHour() != null)) {
//StringBuffer timeFn = new StringBuffer();
if (times.getSecond() != null) {
utilities.sumoTerms.add("second("+"time-"+utilities.timeCount+","+times.getSecond()+"-"+times.getWordIndex() +")");
}
if (times.getMinute() != null) {
utilities.sumoTerms.add("minute("+"time-"+utilities.timeCount+","+times.getMinute()+"-"+times.getWordIndex()+")");
}
if (times.getHour() != null) {
utilities.sumoTerms.add("hour("+"time-"+utilities.timeCount+","+times.getHour()+"-"+times.getWordIndex()+")");
}
String tokenRoot = utilities.populateRootWord(times.getWordIndex());
if (tokenRoot != null) {
utilities.sumoTerms.add("time("+tokenRoot+","+"time-"+utilities.timeCount+")");
}
utilities.timeCount++;
}
}
}
/** ***************************************************************
*/
private String lemmatizeWord(IndexedWord measuredEntity) {
String value = measuredEntity.value();
// keep the surface form for proper nouns; lemmatize everything else
if (!measuredEntity.tag().equals("NNP") && !measuredEntity.tag().equals("NNPS")) {
value = measuredEntity.lemma();
}
return value;
}
/** ***************************************************************
* Generates a set of SUMO terms corresponding to measure functions. Identifies the unit of
* measurement, the value of the measurement, and the measured entity by searching the
* dependency graph.
* @param token the numeric token being processed
* @param count running index used to name the measure instance
* @param utilities shared state holding the dependency graph and SUMO term list
*/
private void measureFn(Tokens token, int count, Utilities utilities) {
IndexedWord tokenNode = utilities.StanfordDependencies.getNodeByIndex(token.getId());
IndexedWord unitOfMeasurementNode = utilities.StanfordDependencies.getParent(tokenNode);
IndexedWord measuredEntity = null;
String posTagRemover = null;
String unitOfMeasurementStr = "";
String sumoUnitOfMeasure = "";
List<String> visitedNodes = new ArrayList<>();
Matcher posTagRemoverMatcher = null;
String measuredEntityStr = null;
boolean flag = false;
//int x = 0;
// strip commas used as thousands separators (e.g. "3,000" -> "3000") so the value parses as a number
if (token.getWord().contains(",")) {
token.setWord(token.getWord().replaceAll(",", ""));
}
if (unitOfMeasurementNode != null) {
//unitOfMeasurementStr = lemmatizeWord(unitOfMeasurementNode);
unitOfMeasurementStr = unitOfMeasurementNode.word();
measuredEntity = utilities.StanfordDependencies.getParent(unitOfMeasurementNode);
visitedNodes.add(unitOfMeasurementNode.toString()+"-"+unitOfMeasurementNode.index());
}
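// the unit node is a root with no parent: fall back to its nsubj or dobj dependent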
if ((measuredEntity == null) && (unitOfMeasurementNode != null)) {
for (SemanticGraphEdge e : utilities.StanfordDependencies.getOutEdgesSorted(unitOfMeasurementNode)) {
if ((e.getRelation().toString().equals("nsubj")) || (e.getRelation().toString().equals("dobj"))) {
measuredEntity = e.getDependent();
flag = true;
break;
}
}
}
else if ((measuredEntity == null) && (unitOfMeasurementNode == null)) {
return;
}
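// otherwise walk up the dependency graph from the unit node until a noun-tagged
// head is found, recording visited nodes so the search cannot revisit them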
while ((measuredEntity != null) && (!flag)) {
measuredEntityStr = measuredEntity.value()+"-"+measuredEntity.index();
if (!visitedNodes.contains(measuredEntityStr)) {
visitedNodes.add(measuredEntityStr);
}
posTagRemoverMatcher = POS_TAG_REMOVER.matcher(measuredEntity.toString());
if (posTagRemoverMatcher.find()) {
posTagRemover = posTagRemoverMatcher.group(1);
if (Utilities.nounTags.contains(posTagRemover)) {
break;
}
//IndexedWord tempMeasuredEntity = StanfordDependencies.getParent(measuredEntity);
if (utilities.StanfordDependencies.getParent(measuredEntity) == null) {
Set<IndexedWord> childrenSet = utilities.StanfordDependencies.getChildren(measuredEntity);
// the node has no parent and only one child, which means it is the unit node itself;
// stop here rather than looping forever
if (childrenSet.size() == 1) {
measuredEntity = unitOfMeasurementNode;
//String lemmatizedWord = lemmatizeWord(measuredEntity);
utilities.sumoTerms.add("measure(" + measuredEntity.word() + "-" + measuredEntity.index() + ", measure" + count + ")");
utilities.sumoTerms.add("unit(measure" + count + ", "+ "memberCount" + ")");
utilities.sumoTerms.add("value(measure" + count + ", " + token.getWord()+ ")");
utilities.sumoTerms.add("valueToken("+token.getWord()+","+token.getWord()+"-"+token.getId()+")");
flag = true;
return;
}
IndexedWord measuredEntity_temp = null;
for (IndexedWord child : childrenSet) {
String childPosTagRemover = null;
posTagRemoverMatcher = POS_TAG_REMOVER.matcher(child.toString());
//childPosTagRemover = posTagRemoverMatcher.group(1);
if (posTagRemoverMatcher.find()) {
childPosTagRemover = posTagRemoverMatcher.group(1);
}
if (!(visitedNodes.contains(child.toString()+"-"+child.index())) && (childPosTagRemover != null) && (Utilities.nounTags.contains(childPosTagRemover.replaceFirst("\\/", "")))) {
if ((utilities.StanfordDependencies.reln(measuredEntity, child) != null) && (utilities.StanfordDependencies.reln(measuredEntity, child).getShortName().equals("nsubj"))) {
measuredEntity = child;
visitedNodes.add(child.toString()+"-"+child.index());
flag = true;
break;
}
measuredEntity_temp = child;
visitedNodes.add(child.toString()+"-"+child.index());
}
}
if (!flag) {
measuredEntity = measuredEntity_temp;
flag = true;
}
}
else {
measuredEntity = utilities.StanfordDependencies.getParent(measuredEntity);
}
}
}
if (measuredEntity != null) {
String lemmatizedWord = lemmatizeWord(measuredEntity);
utilities.sumoTerms.add("measure(" + lemmatizedWord + "-" + measuredEntity.index() + ", measure" + count + ")");
}
sumoUnitOfMeasure = lemmatizeWord(unitOfMeasurementNode);
sumoUnitOfMeasure = WSD.getBestDefaultSUMOsense(sumoUnitOfMeasure, 1);
if ((sumoUnitOfMeasure != null) && (!sumoUnitOfMeasure.isEmpty())) {
sumoUnitOfMeasure = sumoUnitOfMeasure.replaceAll("[^\\p{Alpha}\\p{Digit}]+","");
}
else {
if ((measuredEntity != null) && (unitOfMeasurementStr.equals(measuredEntity.value()))) {
unitOfMeasurementStr = "memberCount";
}
sumoUnitOfMeasure = unitOfMeasurementStr;
}
utilities.sumoTerms.add("unit(measure" + count + ", "+ sumoUnitOfMeasure + ")");
utilities.sumoTerms.add("value(measure" + count + ", " + token.getWord() + ")");
utilities.sumoTerms.add("valueToken("+token.getWord()+","+token.getWord()+"-"+token.getId()+")");
WordNet.wn.initOnce();
}
/** ***************************************************************
* Returns the SUMO terms accumulated in the utilities object.
* @param utilities shared state holding the SUMO term list
*/
public List<String> getMeasureTerms(Utilities utilities) {
return utilities.sumoTerms;
}
/** ***************************************************************
* Sets the measure terms.
* @param measureTerms the list of measure terms
*/
public void setMeasureTerms(List<String> measureTerms) {
this.measureTerms = measureTerms;
}
/** ***************************************************************
* Returns true if the given token is already present in the list of date tokens.
* @param token the token to compare
* @param tempDateList the list of date tokens to search
*/
private boolean checkTokenInList(Tokens token, List<Tokens> tempDateList) {
for (Tokens tempToken : tempDateList) {
if (tempToken.equals(token)) {
return true;
}
}
return false;
}
/** ***************************************************************
* Generates a set of SUMO terms corresponding to time functions. Uses the TimeInfo class to hold
* the time data present in the tokens, and also groups related time tokens together. For example,
* in the phrase "3 hr, 30 min" the tokens 3 and 30 arrive separately; this function recognizes
* that 3 hours and 30 minutes belong to the same time object and groups them.
* @param tokenIdNormalizedTimeMap entries of the form "tokenId@normalizedTime"
* @param utilities shared state holding the dependency graph and SUMO term list
* @param tempDateList the list of date tokens collected so far
* @param datesandDurationHandler handler used to merge date tokens
* @param presentTimeToken the time token currently being processed
* @param prevTimeToken the previously processed time token
*/
private List<TimeInfo> processTime(List<String> tokenIdNormalizedTimeMap, Utilities utilities, List<Tokens> tempDateList, DatesAndDuration datesandDurationHandler, Tokens presentTimeToken, Tokens prevTimeToken) {
List<TimeInfo> timesList = new ArrayList<>();
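// each entry is encoded as "tokenId@normalizedTime" (e.g. "5@T09:30"), as built from
// TIME-tagged tokens in generateSumoTerms()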
for (String timeToken : tokenIdNormalizedTimeMap) {
int id = Integer.valueOf(timeToken.split("@")[0]);
String timeStr = timeToken.split("@")[1];
Matcher hourMinPatternMatcher = HOUR_MINUTE_PATTERN.matcher(timeStr);
Matcher hourMinSecPatternMatcher = HOUR_MINUTE_SECOND_PATTERN.matcher(timeStr);
Matcher yearMonthTimePatternMatcher = YEAR_MONTH_TIME_PATTERN.matcher(timeStr);
TimeInfo timeObj = new TimeInfo();
if (hourMinPatternMatcher.find() && (utilities.StanfordDependencies.getNodeByIndexSafe(id)!=null)) {
timeObj.setMinute(hourMinPatternMatcher.group(2));
timeObj.setHour(hourMinPatternMatcher.group(1));
timeObj.setWordIndex(id);
}
else if (hourMinSecPatternMatcher.find() && (utilities.StanfordDependencies.getNodeByIndexSafe(id)!=null)) {
timeObj.setMinute(hourMinSecPatternMatcher.group(2));
timeObj.setHour(hourMinSecPatternMatcher.group(1));
timeObj.setSecond(hourMinSecPatternMatcher.group(3));
timeObj.setWordIndex(id);
}
else if (yearMonthTimePatternMatcher.find() && (utilities.StanfordDependencies.getNodeByIndexSafe(id)!=null)) {
String year = yearMonthTimePatternMatcher.group(1);
int tokenCnt = new StanfordDateTimeExtractor().getTokenCount() + 1;
if (!year.equals("XXXX")) {
Tokens yearToken = new Tokens();
yearToken.setWord(year);
yearToken.setTokenType("YEAR");
yearToken.setId(id);
if (!checkTokenInList(yearToken, tempDateList)) {
presentTimeToken = yearToken;
datesandDurationHandler.mergeDates(yearToken, tempDateList, prevTimeToken, utilities);
prevTimeToken = presentTimeToken;
}
}
if (yearMonthTimePatternMatcher.group(2) != null) {
String month = yearMonthTimePatternMatcher.group(2).replaceAll("\\-", "");
Tokens monthToken = new Tokens();
Tokens digitMonthToken = new Tokens();
monthToken.setWord(month);
monthToken.setId(id);
monthToken.setTokenType("MONTH");
digitMonthToken.setWord(Utilities.MONTHS.get(Integer.valueOf(month) - 1));
digitMonthToken.setId(id);
digitMonthToken.setTokenType("MONTH");
if (!checkTokenInList(monthToken, tempDateList) && !checkTokenInList(digitMonthToken, tempDateList)) {
presentTimeToken = digitMonthToken;
datesandDurationHandler.mergeDates(digitMonthToken, tempDateList, prevTimeToken, utilities);
prevTimeToken = presentTimeToken;
}
}
if (yearMonthTimePatternMatcher.group(3) != null) {
String day = yearMonthTimePatternMatcher.group(3).replaceAll("\\-", "");
Tokens dayToken = new Tokens();
dayToken.setWord(day);
dayToken.setId(id);
dayToken.setTokenType("DAYS");
if (!checkTokenInList(dayToken, tempDateList)) {
presentTimeToken = dayToken;
datesandDurationHandler.mergeDates(dayToken, tempDateList, prevTimeToken, utilities);
prevTimeToken = presentTimeToken;
}
}
timeObj.setMinute(yearMonthTimePatternMatcher.group(5));
timeObj.setHour(yearMonthTimePatternMatcher.group(4));
timeObj.setWordIndex(id);
if (yearMonthTimePatternMatcher.group(6) != null) {
// group(6) matches ":SS" including the colon, so strip it, as is done for month and day above
timeObj.setSecond(yearMonthTimePatternMatcher.group(6).replaceFirst(":", ""));
timeObj.setWordIndex(id);
}
}
if (!containsTimeInfo(timesList,timeObj)) {
timesList.add(timeObj);
}
}
return timesList;
}
/** ***************************************************************
* Returns true if the given TimeInfo object is already present in the list.
* @param timeList the list of TimeInfo objects to search
* @param timeObject the TimeInfo object to compare
*/
public boolean containsTimeInfo(List<TimeInfo> timeList, TimeInfo timeObject) {
for (TimeInfo t : timeList) {
if (t.equals(timeObject)) {
return true;
}
}
return false;
}
/** ***************************************************************
* Categorizes the input tokens into Date, Number, Duration, and Time, adds the SUMO terms
* produced by the respective handlers, and filters out any duplicates.
* @param tokensList the list of NER-tagged tokens
* @param stanfordParser the StanfordDateTimeExtractor holding the dependency parse
*/
public List<String> generateSumoTerms(List<Tokens> tokensList, StanfordDateTimeExtractor stanfordParser) {
DatesAndDuration datesandDurationHandler = new DatesAndDuration();
Utilities utilities = new Utilities();
utilities.StanfordDependencies = stanfordParser.getDependencies();
List<String> tokenIdNormalizedTimeMap = new ArrayList<>();
int numberCount = 1;
List<Tokens> tempDateList = new ArrayList<>();
Tokens presentDateToken = new Tokens();
Tokens prevDateToken = null;
Tokens presentTimeToken = new Tokens();
Tokens prevTimeToken = null;
Tokens numberToken = new Tokens();
Tokens presentDurationToken = new Tokens();
Tokens prevDurationToken = null;
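// dispatch on each token's NER label; TIME tokens are only collected here and are
// processed together by processTime() below so their parts can be grouped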
for (Tokens token : tokensList) {
presentDateToken = token;
presentDurationToken = token;
switch (token.getNer()) {
case "DATE":
datesandDurationHandler.processDateToken(token, utilities, tempDateList, prevDateToken);
prevDateToken = presentDateToken;
break;
case "NUMBER":
case "ORDINAL":
case "PERCENT":
measureFn(token, numberCount, utilities); //processNumber(token,stanfordParser.getDependencyList());
++numberCount;
break;
case "DURATION":
numberToken = datesandDurationHandler.processDuration(token, utilities, prevDurationToken);
if (numberToken != null) {
measureFn(numberToken, numberCount, utilities);
++numberCount;
}
prevDurationToken = presentDurationToken;
break;
case "TIME":
tokenIdNormalizedTimeMap.add(token.getId() + "@" + token.getNormalizedNer());
}
}
List<TimeInfo> timesList = processTime(tokenIdNormalizedTimeMap, utilities, tempDateList, datesandDurationHandler, presentTimeToken, prevTimeToken);
datesandDurationHandler.generateSumoDateTerms(utilities, tempDateList);
datesandDurationHandler.processUnhandledDuration(utilities);
generateSumoTimeTerms(timesList,utilities);
utilities.filterSumoTerms();
return utilities.sumoTerms;
}
}
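/* ***************************************************************
* Usage sketch (illustrative only, not from the original source): one way the
* generateSumoTerms() entry point might be driven. populateParserInfo is a
* hypothetical name for the step that runs the Stanford pipeline and fills the
* token list and dependency graph this class reads; the real sigma-nlp entry
* point may differ.
*
*   StanfordDateTimeExtractor extractor = new StanfordDateTimeExtractor();
*   List<Tokens> tokens = extractor.populateParserInfo("The ride took 3 hr, 30 min.");  // hypothetical
*   DateAndNumbersGeneration generator = new DateAndNumbersGeneration();
*   List<String> sumoTerms = generator.generateSumoTerms(tokens, extractor);
*   // sumoTerms now holds strings of the forms built above, e.g.
*   // measure(<entity>-<idx>, measure1), unit(measure1, <SUMO unit>), value(measure1, <number>)
*/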