// edu.isi.nlp.temporal.Timex2Time (Maven / Gradle / Ivy)
package edu.isi.nlp.temporal;
import static com.google.common.base.Objects.equal;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.base.Optional;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import edu.isi.nlp.symbols.Symbol;
import edu.isi.nlp.symbols.SymbolUtils;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.joda.time.Interval;
import org.joda.time.Period;
import org.joda.time.format.ISODateTimeFormat;
import org.joda.time.format.ISOPeriodFormat;
import org.joda.time.format.PeriodFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Represents a time value according to "TIDES Instruction Manual for the Annotation of Temporal
 * Expressions", Lisa Ferro, 2001. Please refer to that document for details.
 *
 * <p>Beware: full validity checking is not done, so it is possible to construct invalid Timex
 * expressions.
 *
 * <p>Equality is determined strictly by field equality and not by any comparison of the time
 * periods denoted.
 *
 * <p>This currently only supports durations of the form P\dE, P\dC, P\dL (2001 specification),
 * P\dDE, P\dCE, P\dML (2005 specification), or ISO 8601 periods. Durations which mix and match
 * these (which should be incredibly rare) will act as if they have no duration. See TIMEX2
 * specification section 3.2.4.1.
 *
 * @author Ryan Gabbard, Feb. 2014
 */
public final class Timex2Time {
  private static final Logger log = LoggerFactory.getLogger(Timex2Time.class);

  /**
   * TIMEX2 value tokens that are not part of ISO 8601 (weekend, periods of the day, seasons, fiscal
   * year, etc.). A value component equal to one of these ends the "specific" prefix of a value when
   * it is interpreted as an interval.
   */
  public static final ImmutableSet<Symbol> nonISOMarkers =
      SymbolUtils.setFrom(
          "WE", "MO", "MI", "AF", "EV", "NI", "PM", "DT", "FA", "WI", "SP", "SU", "FY", "Q1", "Q2",
          "Q3", "Q4", "QX", "H1", "H2", "HX", "BC", "KA", "MA", "GA");

  /** Matches any occurrence of a non-ISO marker inside a string (used on the time-of-day part). */
  private static final Pattern nonISOMarkersPattern =
      Pattern.compile("(" + Joiner.on("|").join(nonISOMarkers) + ")");

  /** The allowed values of the TIMEX2 modifier attribute. */
  public enum Modifier {
    BEFORE,
    AFTER,
    ON_OR_BEFORE,
    ON_OR_AFTER,
    LESS_THAN,
    MORE_THAN,
    EQUAL_OR_LESS,
    EQUAL_OR_MORE,
    START,
    MID,
    END,
    APPROX;
  }

  /** The allowed values of the TIMEX2 anchor direction attribute. */
  public enum AnchorDirection {
    STARTING,
    ENDING,
    WITHIN,
    BEFORE,
    AFTER,
    AS_OF;
  }

  private static final Symbol PRESENT_REF = Symbol.from("PRESENT_REF");
  private static final Symbol PAST_REF = Symbol.from("PAST_REF");
  private static final Symbol FUTURE_REF = Symbol.from("FUTURE_REF");

  // keeping for backward compatibility
  public static final Symbol BEFORE = Symbol.from(AnchorDirection.BEFORE.name());
  public static final Symbol AFTER = Symbol.from(AnchorDirection.AFTER.name());
  public static final Symbol AS_OF = Symbol.from(AnchorDirection.AS_OF.name());

  // All of the following may be null (or false) when the corresponding attribute is absent.
  private final Symbol val;
  private final Modifier mod;
  private final boolean set;
  private final Symbol granularity;
  private final Symbol periodicity;
  private final Symbol anchorVal;
  private final AnchorDirection anchorDir;
  private final boolean nonSpecific;

  // Joda-time interpretation.
  // This is a derived field (derived from val if val denotes a period); it is deliberately left
  // out of equals() and hashCode().
  private final Period duration;

  /** Returns whether the non-specific flag is set. */
  public boolean isNonSpecific() {
    return nonSpecific;
  }

  /** Returns the value attribute, if any. */
  public Optional<Symbol> value() {
    return Optional.fromNullable(val);
  }

  /** Returns the modifier attribute, if any. */
  public Optional<Modifier> modifier() {
    return Optional.fromNullable(mod);
  }

  /** Returns whether the set flag is set. */
  public boolean isSet() {
    return set;
  }

  /** Returns the granularity attribute, if any. */
  public Optional<Symbol> granularity() {
    return Optional.fromNullable(granularity);
  }

  /** Returns the periodicity attribute, if any. */
  public Optional<Symbol> periodicity() {
    return Optional.fromNullable(periodicity);
  }

  /** Returns the anchor value attribute, if any. */
  public Optional<Symbol> anchorValue() {
    return Optional.fromNullable(anchorVal);
  }

  /** Returns the anchor direction attribute, if any. */
  public Optional<AnchorDirection> anchorDirection() {
    return Optional.fromNullable(anchorDir);
  }

  /** Returns true if the value is the special {@code PRESENT_REF} token. */
  public boolean isReferenceToPresent() {
    return val == PRESENT_REF;
  }

  /** Returns true if the value is the special {@code PAST_REF} token. */
  public boolean isReferenceToPast() {
    return val == PAST_REF;
  }

  /** Returns true if the value is the special {@code FUTURE_REF} token. */
  public boolean isReferenceToFuture() {
    return val == FUTURE_REF;
  }

  /** Returns true if the value could be interpreted as a period of time. */
  public boolean isDuration() {
    return duration != null;
  }

  /** Returns the value interpreted as a Joda-time {@link Period}, when that was possible. */
  public Optional<Period> duration() {
    return Optional.fromNullable(duration);
  }

  /**
   * Returns a best-effort interpretation of the anchor value as an {@link Interval}; absent if
   * there is no anchor value or no interval could be derived from it.
   */
  public Optional<Interval> anchorAsInterval() {
    if (anchorVal != null) {
      return parseValueAsInterval(anchorVal.toString());
    }
    return Optional.absent();
  }

  /**
   * Returns a best-effort interpretation of the value as an {@link Interval}; absent if there is
   * no value or no interval could be derived from it.
   */
  public Optional<Interval> valueAsInterval() {
    if (val != null) {
      return parseValueAsInterval(val.toString());
    }
    return Optional.absent();
  }

  /** Creates an instance with no attributes set. */
  public static Timex2Time createEmpty() {
    return new Timex2Time(null, null, false, null, null, null, null, false);
  }

  /** Creates an instance whose value is {@code PRESENT_REF}. */
  public static Timex2Time present() {
    return new Timex2Time(PRESENT_REF, null, false, null, null, null, null, false);
  }

  /** Creates an instance whose value is {@code PAST_REF}. */
  public static Timex2Time past() {
    return new Timex2Time(PAST_REF, null, false, null, null, null, null, false);
  }

  /** Creates an instance whose value is {@code FUTURE_REF}. */
  public static Timex2Time future() {
    return new Timex2Time(FUTURE_REF, null, false, null, null, null, null, false);
  }

  /**
   * Returns a {@link Builder} pre-populated with every attribute of this instance (including the
   * modifier), so that {@code copyBuilder().build()} is equal to this object.
   */
  public Builder copyBuilder() {
    final Builder builder = new Builder();
    if (set) {
      builder.withIsSet(true);
    }
    if (periodicity != null) {
      builder.withPeriodicity(periodicity);
    }
    if (granularity != null) {
      builder.withGranularity(granularity);
    }
    if (nonSpecific) {
      builder.setNonSpecific(true);
    }
    if (anchorVal != null) {
      builder.withAnchorValue(anchorVal);
    }
    if (anchorDir != null) {
      builder.withAnchorDirectionFromEnum(anchorDir);
    }
    if (val != null) {
      builder.withVal(val);
    }
    if (mod != null) {
      // the modifier used to be dropped here, which made copies unequal to their source
      builder.withModifier(mod);
    }
    return builder;
  }

  // Timex stores boolean values as YES or empty
  private static final Symbol YES = Symbol.from("YES");

  /**
   * Interprets a TIMEX2 boolean flag.
   *
   * @return true if {@code s} is the {@code YES} symbol
   * @throws Timex2Exception for any other input, including null; callers are expected not to call
   *     this at all when the flag is absent
   */
  private static boolean parseYES(final Symbol s) throws Timex2Exception {
    if (s == YES) {
      return true;
    }
    throw new Timex2Exception(String.format("Invalid boolean flag %s. Must be YES or absent.", s));
  }

  /**
   * Maps each TIMEX2-specific whole-string period pattern to the number of years one unit stands
   * for. Group 1 of every pattern is the unit count.
   */
  private static final ImmutableMap<Pattern, Integer> periodPatternsToYearMultipliers =
      ImmutableMap.<Pattern, Integer>builder()
          .put(
              // Decades-2001 specification
              Pattern.compile("P(\\d+)E"), 10)
          .put(
              // Centuries-2001 specification
              Pattern.compile("P(\\d+)C"), 100)
          .put(
              // Millenia-2001 specification
              Pattern.compile("P(\\d+)L"), 1000)
          .put(
              // Decades-2005 specification
              Pattern.compile("P(\\d+)DE"), 10)
          .put(
              // Centuries-2005 specification
              Pattern.compile("P(\\d+)CE"), 100)
          .put(
              // Millenia-2005 specification
              Pattern.compile("P(\\d+)ML"), 1000)
          .build();

  private static final PeriodFormatter ISO8601_PERIOD_PARSER = ISOPeriodFormat.standard();

  /**
   * This method makes a "best effort" to make a {@link Period} out of a TIMEX2 period string.
   * TIMEX2 periods are superset of ISO 8601 periods. For example, TIMEX2 periods can have strings
   * like SU (for summer), FY (for fiscal year) or X for unspecified numbers (like PXM for
   * unspecified no. of months). This method will return Optional.absent() for any such non-ISO8601
   * Period values.
   *
   * <p>The only special consideration is with decade, century or millenium markers (E,C,L in
   * Timex2001; DE,CE,ML in Timex2005). These will be multiplied by appropriate multipliers to get
   * the right period. For details refer to sections 4.2 and 4.3 of Timex2005 specification. Such a
   * marker is only recognized when it makes up the whole period (e.g. {@code P3DE}); periods mixing
   * it with other components yield Optional.absent().
   *
   * @param periodSym period-value as Symbol; must not be null
   * @return Optional {@link Period} object. Optional.absent() is returned when the string does not
   *     start with {@code P}, when it is not parseable as described above, or when the resulting
   *     number of years does not fit in an {@code int}.
   * @author rgabbard
   */
  private static Optional<Period> parseDuration(final Symbol periodSym) {
    checkNotNull(periodSym);
    final String period = periodSym.toString();
    if (!period.startsWith("P")) {
      return Optional.absent();
    }

    for (final Map.Entry<Pattern, Integer> timexExtension :
        periodPatternsToYearMultipliers.entrySet()) {
      final Matcher m = timexExtension.getKey().matcher(period);
      // The patterns include the leading "P" and must cover the whole string; the previous
      // lookingAt() against patterns starting with digits could never succeed on "P...".
      if (!m.matches()) {
        continue;
      }
      try {
        // multiply in long so that a huge count cannot silently wrap around
        final long years = (long) timexExtension.getValue() * Integer.parseInt(m.group(1));
        if (years > Integer.MAX_VALUE) {
          log.warn("Duration {} is too large to represent, skipping.", period);
          return Optional.absent();
        }
        return Optional.of(Period.years((int) years));
      } catch (final NumberFormatException nfe) {
        log.warn("Failed to parse duration {}, skipping. Due to {}", period, nfe);
        return Optional.absent();
      }
    }

    try {
      return Optional.of(ISO8601_PERIOD_PARSER.parsePeriod(period));
    } catch (final IllegalArgumentException ignored) {
      // not an ISO 8601 period (e.g. PXM, or a season marker): no duration by design
      return Optional.absent();
    }
  }

  private static final Joiner dashJoiner = Joiner.on("-");

  /**
   * Renders the attributes that are present, joined with dashes, in the order: value, modifier,
   * {@code SET}, granularity, periodicity, anchor value, anchor direction, {@code NONSPECIFIC}.
   */
  @Override
  public String toString() {
    final List<String> parts = Lists.newArrayList();
    if (val != null) {
      parts.add(val.toString());
    }
    if (mod != null) {
      parts.add(mod.toString());
    }
    if (set) {
      parts.add("SET");
    }
    if (granularity != null) {
      parts.add(granularity.toString());
    }
    if (periodicity != null) {
      parts.add(periodicity.toString());
    }
    if (anchorVal != null) {
      parts.add(anchorVal.toString());
    }
    if (anchorDir != null) {
      parts.add(anchorDir.toString());
    }
    if (nonSpecific) {
      parts.add("NONSPECIFIC");
    }
    return dashJoiner.join(parts);
  }

  /**
   * Stores the attributes after a few consistency checks and derives {@link #duration} from the
   * value.
   *
   * @throws NullPointerException if a modifier is given without a value
   * @throws IllegalArgumentException if granularity or periodicity is given while {@code set} is
   *     false
   */
  private Timex2Time(
      final Symbol val,
      final Modifier mod,
      final boolean set,
      final Symbol granularity,
      final Symbol periodicity,
      final Symbol anchorVal,
      final AnchorDirection anchorDir,
      final boolean nonSpecific) {
    if (mod != null) {
      checkNotNull(val, "Value cannot be null if mod is %s", mod);
    }
    if (granularity != null) {
      // granularity always appears in conjunction with set
      checkArgument(set, "Granularity %s can only be specified together with set", granularity);
    }
    if (periodicity != null) {
      // periodicity always appears in conjunction with set
      checkArgument(set, "Periodicity %s can only be specified together with set", periodicity);
    }

    this.val = val;
    this.mod = mod;
    this.set = set;
    this.granularity = granularity;
    this.periodicity = periodicity;
    this.anchorVal = anchorVal;
    this.anchorDir = anchorDir;
    this.nonSpecific = nonSpecific;
    this.duration = (val != null) ? parseDuration(val).orNull() : null;
  }

  @Override
  public int hashCode() {
    return Objects.hashCode(
        val, mod, set, granularity, periodicity, anchorVal, anchorDir, nonSpecific);
  }

  @Override
  public boolean equals(final Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null) {
      return false;
    }
    if (getClass() != obj.getClass()) {
      return false;
    }
    final Timex2Time other = (Timex2Time) obj;
    return equal(val, other.val)
        && equal(mod, other.mod)
        && equal(set, other.set)
        && equal(granularity, other.granularity)
        && equal(periodicity, other.periodicity)
        && equal(anchorVal, other.anchorVal)
        && equal(anchorDir, other.anchorDir)
        && equal(nonSpecific, other.nonSpecific);
  }

  /**
   * This method makes a "best effort" to make an {@link Interval} out of a TIMEX2 value string.
   * TIMEX2 value (or date-time) strings are superset of ISO 8601 periods. For example, TIMEX2
   * values can have seasons (like SU for summer, FA for fall, etc) or periods of day (like MO for
   * morning, NI for night), or unspecified values like 1999-09-XX (See section 4.3 of the
   * specification). For such values that make the intervals fuzzy, we will try to find the interval
   * upto which the timex value is specific. For example, 1999-09-08TNI will return the interval for
   * 1999-09-08, and 1999-FA or 1999-WXX will return the interval for 1999. However, we reserve the
   * right to make specific inferences from non-ISO markers in future implementations (e.g. FA could
   * mean the period from third week of September to third week of December).
   *
   * <p>TIMEX2 values with omissions, like VAL="199" (meaning the decade of 1990s) or VAL="20"
   * (meaning the 21st century) will be converted to appropriate interval values (10 years or 100
   * years respectively, for this example).
   *
   * <p>If no interval can be discerned from the value at all, for example, XXXX-09 (September of
   * unspecified year) or XX63 (63rd year of unspecified year), Optional.absent() will be returned.
   *
   * @param timexValue timex-value as a string
   * @return Optional {@link Interval} object (see the description for when Optional.absent will be
   *     returned).
   * @author rgabbard, msrivast
   */
  private static Optional<Interval> parseValueAsInterval(final String timexValue) {
    final String val = specificPrefix(dropFuzzyTimeOfDay(timexValue));

    // it could be a simple date which we can extract
    // directly (very common; needs to come after the above because
    // its regex will match week-based dates, too)
    try {
      return Optional.of(
          ISODateTimeFormat.yearMonthDay().parseDateTime(val).dayOfMonth().toInterval());
    } catch (final IllegalArgumentException ignored) {
      // not in this format; try the next one
    }
    // could be YYYY-MM type date
    try {
      return Optional.of(
          ISODateTimeFormat.yearMonth().parseDateTime(val).monthOfYear().toInterval());
    } catch (final IllegalArgumentException ignored) {
      // not in this format; try the next one
    }
    // could be a YYYY type date
    try {
      return Optional.of(ISODateTimeFormat.year().parseDateTime(val).year().toInterval());
    } catch (final IllegalArgumentException ignored) {
      // not in this format; try the next one
    }
    // it could be a 2014-W3-01 style week-based date
    try {
      return Optional.of(
          ISODateTimeFormat.weekyearWeekDay().parseDateTime(val).dayOfMonth().toInterval());
    } catch (final IllegalArgumentException ignored) {
      // it's okay if it's unparseable; just wasn't in this format
    }
    // it would be a 2014-W3 style week-based date
    try {
      return Optional.of(
          ISODateTimeFormat.weekyearWeek().parseDateTime(val).weekOfWeekyear().toInterval());
    } catch (final IllegalArgumentException ignored) {
      // it's okay if it's unparseable; just wasn't in this format
    }
    // this needs to go last or it would short-circuit the week
    // formats
    try {
      return Optional.of(
          ISODateTimeFormat.dateTimeParser().parseDateTime(val).minuteOfDay().toInterval());
    } catch (final IllegalArgumentException ignored) {
      // not in this format; fall through to the truncated-year handling
    }

    // value was not parseable as a joda time Interval--see if the first one, two or three
    // characters of val are digits, which would address the cases like 1,19,199
    int digitCount = 0;
    while (digitCount < 3
        && digitCount < val.length()
        && Character.isDigit(val.charAt(digitCount))) {
      digitCount++;
    }
    if (digitCount > 0) {
      // 1 digit -> millennium, 2 digits -> century, 3 digits -> decade
      final int period = (int) Math.pow(10, 4 - digitCount);
      final int startYear = Integer.parseInt(val.substring(0, digitCount)) * period;
      final int endYear = startYear + period;
      return Optional.of(Interval.parse(startYear + "/" + endYear));
    }

    log.warn("Value not parseable as interval: {}", val);
    return Optional.absent();
  }

  /**
   * If the time-of-day part (everything after the first {@code T}) is empty, contains an {@code X}
   * or contains a non-ISO marker (TMO, TNI, TXX etc.), returns only the part before the {@code T};
   * otherwise returns the value unchanged.
   */
  private static String dropFuzzyTimeOfDay(final String value) {
    final int tIndex = value.indexOf('T');
    if (tIndex < 0) {
      return value;
    }
    final String datePart = value.substring(0, tIndex);
    final String timePart = value.substring(tIndex + 1);
    final boolean fuzzy =
        timePart.isEmpty()
            || timePart.contains("X")
            || nonISOMarkersPattern.matcher(timePart).find();
    return fuzzy ? datePart : datePart + "T" + timePart;
  }

  /**
   * Keeps the leading hyphen-separated components of {@code value} up to (not including) the first
   * one that contains an {@code X} or is a non-ISO marker (weekend, season, FY etc.).
   */
  private static String specificPrefix(final String value) {
    // Timex2 standard says that it extends ISO 8601 for vals which means basic ISO 8601 format
    // may also be allowed. All the examples that I have seen have always had hyphens tho.
    // Therefore, for simplicity, we will split val on hyphens ~msrivast
    final ImmutableList.Builder<String> specificValParts = ImmutableList.builder();
    for (final String part : value.split("-")) {
      // the timex value can be specific only up to the first fuzzy component
      if (part.contains("X") || nonISOMarkers.contains(Symbol.from(part))) {
        break;
      }
      specificValParts.add(part);
    }
    return dashJoiner.join(specificValParts.build());
  }

  /** Returns a new, empty {@link Builder}. */
  public static Builder builder() {
    return new Builder();
  }

  /**
   * Mutable builder for {@link Timex2Time}. Attributes that are never set stay absent (or false).
   * All setters taking an object reject null with a {@link NullPointerException}.
   */
  public static class Builder {
    private Builder() {}

    private Symbol val;
    private Modifier mod = null;
    private boolean isSet = false;
    private Symbol granularity = null;
    private Symbol periodicity = null;
    private Symbol anchorVal = null;
    private AnchorDirection anchorDir = null;
    private boolean nonSpecific = false;

    /**
     * Builds the {@link Timex2Time}.
     *
     * @throws NullPointerException if a modifier was set without a value
     * @throws IllegalArgumentException if granularity or periodicity was set without the set flag
     */
    public Timex2Time build() {
      return new Timex2Time(
          val, mod, isSet, granularity, periodicity, anchorVal, anchorDir, nonSpecific);
    }

    public Builder withVal(Symbol val) {
      this.val = checkNotNull(val);
      return this;
    }

    public Builder withVal(String val) {
      this.val = Symbol.from(checkNotNull(val));
      return this;
    }

    public Builder withModifier(Modifier mod) {
      this.mod = checkNotNull(mod);
      return this;
    }

    /**
     * Sets the modifier from the name of a {@link Modifier} constant.
     *
     * @throws IllegalArgumentException if {@code mod} is not the name of a {@link Modifier}
     */
    public Builder withModifierFromString(String mod) {
      return withModifier(Modifier.valueOf(checkNotNull(mod)));
    }

    /**
     * Sets the set flag from a TIMEX2 boolean.
     *
     * @throws Timex2Exception if {@code timexBoolean} is anything other than {@code YES}
     */
    public Builder withIsSetFromTimexBoolean(Symbol timexBoolean) {
      this.isSet = parseYES(timexBoolean);
      return this;
    }

    public Builder withIsSet(boolean isSet) {
      this.isSet = isSet;
      return this;
    }

    public Builder withGranularity(Symbol granularity) {
      this.granularity = checkNotNull(granularity);
      return this;
    }

    public Builder withGranularity(String granularity) {
      this.granularity = Symbol.from(checkNotNull(granularity));
      return this;
    }

    public Builder withPeriodicity(Symbol periodicity) {
      this.periodicity = checkNotNull(periodicity);
      return this;
    }

    public Builder withPeriodicity(String periodicity) {
      this.periodicity = Symbol.from(checkNotNull(periodicity));
      return this;
    }

    public Builder withAnchorValue(Symbol anchorValue) {
      this.anchorVal = checkNotNull(anchorValue);
      return this;
    }

    public Builder withAnchorValue(String anchorValue) {
      this.anchorVal = Symbol.from(checkNotNull(anchorValue));
      return this;
    }

    /**
     * Sets the anchor direction from a symbol naming an {@link AnchorDirection} constant.
     *
     * @throws IllegalArgumentException if the symbol does not name an {@link AnchorDirection}
     */
    public Builder withAnchorDirection(Symbol anchorDir) {
      checkNotNull(anchorDir);
      this.anchorDir = AnchorDirection.valueOf(anchorDir.asString());
      return this;
    }

    /**
     * Sets the anchor direction from the name of an {@link AnchorDirection} constant.
     *
     * @throws IllegalArgumentException if the string does not name an {@link AnchorDirection}
     */
    public Builder withAnchorDirection(String anchorDir) {
      checkNotNull(anchorDir);
      this.anchorDir = AnchorDirection.valueOf(anchorDir);
      return this;
    }

    public Builder withAnchorDirectionFromEnum(AnchorDirection anchorDir) {
      this.anchorDir = checkNotNull(anchorDir);
      return this;
    }

    /**
     * Sets the non-specific flag from a TIMEX2 boolean.
     *
     * @throws Timex2Exception if {@code timexBoolean} is anything other than {@code YES}
     */
    public Builder setNonSpecificFromTimexBoolean(Symbol timexBoolean) {
      this.nonSpecific = parseYES(timexBoolean);
      return this;
    }

    public Builder setNonSpecific(boolean nonSpecific) {
      this.nonSpecific = nonSpecific;
      return this;
    }
  }
}