semRewrite.datesandnumber.DatesAndDuration Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sigma-nlp Show documentation
Show all versions of sigma-nlp Show documentation
Natural language processing toolbox using Sigma knowledge engineering system.
package semRewrite.datesandnumber;
/*
Copyright 2014-2015 IPsoft
Author: Nagaraj Bhat [email protected]
Rashmi Rao
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program ; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA 02111-1307 USA
*/
import semRewrite.datesandnumber.DateInfo;
import semRewrite.datesandnumber.Utilities;
import edu.stanford.nlp.ling.IndexedWord;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class DatesAndDuration {
static final Pattern YEAR_PATTERN = Pattern.compile("^[0-9]{4}$");
static final Pattern MMDDYYYY_PATTERN = Pattern.compile("^([0-9]{1,2})(\\/|\\-|\\.)([0-9]{1,2})(\\/|\\-|\\.)([0-9]{4})$");
static final Pattern DATE_PATTERN = Pattern.compile("^(0?[1-9]|[12][0-9]|3[01])(th)?$");
static final Pattern DURATION_PATTERN = Pattern.compile("^P([0-9])+[A-Z]");
/** ***************************************************************
*/
public void processDateToken(Tokens presentDateToken, Utilities utilities, List tempDateList, Tokens prevDateToken) {
Matcher yearPatternMatcher = YEAR_PATTERN.matcher(presentDateToken.getWord());
Matcher mmddyyyyPatternMatcher = MMDDYYYY_PATTERN.matcher(presentDateToken.getWord());
Matcher datePatternMatcher = DATE_PATTERN.matcher(presentDateToken.getWord());
if (Utilities.MONTHS.contains(presentDateToken.getWord().toLowerCase())) {
presentDateToken.setTokenType("MONTH");
mergeDates(presentDateToken, tempDateList, prevDateToken, utilities);
}
else if (Utilities.DAYS.contains(presentDateToken.getWord().toLowerCase())) {
String tokenRoot = utilities.populateRootWord(presentDateToken.getId());
if (tokenRoot != null) {
utilities.sumoTerms.add("time("+tokenRoot+","+"time-"+utilities.timeCount+")");
}
addDateToList(null, null, null, presentDateToken.getWord(), presentDateToken.getId(), utilities);
utilities.sumoTerms.add("day(time-"+utilities.timeCount+","+presentDateToken.getWord()+")");
utilities.timeCount++;
}
else if (yearPatternMatcher.find()) {
presentDateToken.setTokenType("YEAR");
mergeDates(presentDateToken, tempDateList, prevDateToken, utilities);
}
else if (mmddyyyyPatternMatcher.find()) {
String tokenRoot = utilities.populateRootWord(presentDateToken.getId());
if (tokenRoot != null) {
utilities.sumoTerms.add("time("+tokenRoot+","+"time-"+utilities.timeCount+")");
}
addDateToList(mmddyyyyPatternMatcher.group(3), Utilities.MONTHS.get(Integer.valueOf(mmddyyyyPatternMatcher.group(1))-1),
mmddyyyyPatternMatcher.group(5), null, presentDateToken.getId(), utilities);
}
else if (datePatternMatcher.find()) {
if(presentDateToken.getWord().contains("th")) {
presentDateToken.setWord(presentDateToken.getWord().replace("th", ""));
}
presentDateToken.setTokenType("DAYS");
mergeDates(presentDateToken, tempDateList, prevDateToken, utilities);
}
}
/** ***************************************************************
*/
private void populateDateInfo(semRewrite.datesandnumber.DateInfo tempDate, String year, String month, String day, String weekDay) {
if(day != null) {
tempDate.setDay(day);
}
if(month != null) {
tempDate.setMonth(month);
}
if(year != null) {
tempDate.setYear(year);
}
if(weekDay != null) {
tempDate.setWeekDay(weekDay);
}
}
/** ***************************************************************
*/
private void addDateToList(String day, String month, String year, String weekDay, int index, Utilities utilities) {
semRewrite.datesandnumber.DateInfo tempDate = new semRewrite.datesandnumber.DateInfo();
populateDateInfo(tempDate, year, month, day, weekDay);
tempDate.addWordIndex(index);
tempDate.setTimeCount(utilities.timeCount);
utilities.datesList.add(tempDate);
}
/** ***************************************************************
*/
private void addInfoToDateList(List tempDateList, Utilities utilities) {
semRewrite.datesandnumber.DateInfo tempDateInfo = new semRewrite.datesandnumber.DateInfo();
for(Tokens token : tempDateList) {
switch(token.getTokenType()) {
case "MONTH" : tempDateInfo.setMonth(token.getWord());
break;
case "YEAR" : tempDateInfo.setYear(token.getWord());
break;
case "DAYS" : tempDateInfo.setDay(token.getWord());
break;
}
tempDateInfo.addWordIndex(token.getId());
}
if(!tempDateInfo.isEmpty()) {
utilities.datesList.add(tempDateInfo);
}
}
/** ***************************************************************
*/
private boolean containsDateToken(List tempDateList, Tokens token) {
for(Tokens tempToken : tempDateList) {
if(tempToken.getTokenType().equals(token.getTokenType())) {
return false;
}
}
return true;
}
/** ***************************************************************
*/
private void clearTempDateList(List tempDateList, Utilities utilities, Tokens token) {
addInfoToDateList(tempDateList, utilities);
tempDateList.clear();
tempDateList.add(token);
}
/** ***************************************************************
*/
public void mergeDates(Tokens presentDateToken, List tempDateList, Tokens prevDateToken, Utilities utilities) {
if(prevDateToken == null) {
tempDateList.add(presentDateToken);
return;
}
if((presentDateToken.getId() - prevDateToken.getId()) <= 2 && !presentDateToken.getTokenType().equals(prevDateToken.getTokenType())) {
if(containsDateToken(tempDateList,presentDateToken)) {
tempDateList.add(presentDateToken);
} else {
clearTempDateList(tempDateList, utilities, presentDateToken);
}
} else {
clearTempDateList(tempDateList, utilities, presentDateToken);
}
}
/** ***************************************************************
*/
public List generateSumoDateTerms(Utilities utilities, List tempDateList){
if(!tempDateList.isEmpty()) {
addInfoToDateList(tempDateList, utilities);
tempDateList.clear();
}
List dateList = utilities.datesList;
for (semRewrite.datesandnumber.DateInfo date : dateList) {
if ((date.getYear() != null) || (date.getMonth() != null) || (date.getDay() != null)) {
if (date.getDay() != null) {
utilities.sumoTerms.add("day(time-"+utilities.timeCount+","+date.getDay()+")");
}
if (date.getMonth() != null) {
utilities.sumoTerms.add("month(time-"+utilities.timeCount+","+date.getMonth()+")");
}
if (date.getYear() != null) {
utilities.sumoTerms.add("year(time-"+utilities.timeCount+","+date.getYear()+")");
}
String tokenRoot = utilities.populateRootWord(date.getWordIndex());
date.setTimeCount(utilities.timeCount);
if (tokenRoot != null) {
utilities.sumoTerms.add("time("+tokenRoot+","+"time-"+utilities.timeCount+")");
}
utilities.timeCount++;
}
}
return dateList;
}
/** ***************************************************************
*/
public Tokens processDuration(Tokens token, Utilities utilities, Tokens prevDurationToken) {
if (token.getWord().matches("[0-9]{4}\\-[0-9]{4}")) {
String years[] = token.getWord().split("-");
IndexedWord tempParent = utilities.StanfordDependencies.getNodeByIndex(token.getId());
tempParent = getAssociatedWord(utilities, tempParent);
semRewrite.datesandnumber.DateInfo newDateInfo = new semRewrite.datesandnumber.DateInfo();
newDateInfo.setYear(years[0]);
newDateInfo.addWordIndex(token.getId());
newDateInfo.setTimeCount(utilities.timeCount);
utilities.sumoTerms.add("year(time-"+utilities.timeCount+","+years[0]+")");
utilities.timeCount++;
semRewrite.datesandnumber.DateInfo endDateInfo = new semRewrite.datesandnumber.DateInfo();
endDateInfo.setYear(years[1]);
endDateInfo.addWordIndex(token.getId());
endDateInfo.setTimeCount(utilities.timeCount);
utilities.sumoTerms.add("year(time-"+utilities.timeCount+","+years[1]+")");
utilities.timeCount++;
generateDurationSumoTerms(tempParent,utilities, newDateInfo, endDateInfo);
} else {
if(prevDurationToken == null) {
Matcher durationMatcher = DURATION_PATTERN.matcher(token.getNormalizedNer());
if(durationMatcher.find()) {
return token;
}
} else if(!(token.getNormalizedNer().equals(prevDurationToken.getNormalizedNer())) || !(token.getId() - 1 == prevDurationToken.getId())) {
Matcher durationMatcher = DURATION_PATTERN.matcher(token.getNormalizedNer());
if(durationMatcher.find()) {
return token;
}
}
}
return null;
}
/** ***************************************************************
*/
public void generateDurationSumoTerms(IndexedWord tempParent, Utilities utilities, semRewrite.datesandnumber.DateInfo startDateInfo, DateInfo endDateInfo) {
if (tempParent != null) {
if (Utilities.VerbTags.contains(tempParent.tag())) {
utilities.sumoTerms.add("StartTime(" + tempParent.value()+"-"+tempParent.index() + "," + "time-" + startDateInfo.getTimeCount() + ")");
utilities.sumoTerms.add("EndTime(" + tempParent.value() +"-"+tempParent.index()+ "," + "time-" + endDateInfo.getTimeCount() + ")");
}
if (Utilities.nounTags.contains(tempParent.tag())) {
if (tempParent.ner().equals("PERSON")) {
utilities.sumoTerms.add("BirthDate(" + tempParent.value() +"-"+tempParent.index()+ "," + "time-" + startDateInfo.getTimeCount() + ")");
utilities.sumoTerms.add("DeathDate(" + tempParent.value() +"-"+tempParent.index()+ "," + "time-" + endDateInfo.getTimeCount() + ")");
}
else {
utilities.sumoTerms.add("StartTime(" + tempParent.value() +"-"+tempParent.index()+ "," + "time-" + startDateInfo.getTimeCount() + ")");
utilities.sumoTerms.add("EndTime(" + tempParent.value() +"-"+tempParent.index()+ "," + "time-" + endDateInfo.getTimeCount() + ")");
}
}
startDateInfo.setDurationFlag(true);
endDateInfo.setDurationFlag(true);
}
}
/** ***************************************************************
*/
public IndexedWord getAssociatedWord(Utilities utilities, IndexedWord tempParent) {
while (!tempParent.equals(utilities.StanfordDependencies.getFirstRoot())) {
tempParent = utilities.StanfordDependencies.getParent(tempParent);
if (Utilities.VerbTags.contains(tempParent.tag()) ||
Utilities.nounTags.contains(tempParent.tag())) {
break;
}
}
return tempParent;
}
/** ***************************************************************
*/
public void processUnhandledDuration(Utilities utilities) {
for (int i = 0; i < utilities.datesList.size() - 1; i++) {
if ((utilities.datesList.get(i).getEndIndex() + 2) == (utilities.datesList.get(i + 1).getWordIndex())) {
utilities.datesList.get(i).setDurationFlag(true);
utilities.datesList.get(i+1).setDurationFlag(true);
IndexedWord tempParent = utilities.StanfordDependencies.getNodeByIndex(utilities.datesList.get(i).getWordIndex());
tempParent = getAssociatedWord(utilities, tempParent);
generateDurationSumoTerms(tempParent, utilities, utilities.datesList.get(i), utilities.datesList.get(i+1));
}
}
}
}