nt.opensextant-xponents-core.3.3.6.source-code.datetime_patterns.cfg Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensextant-xponents-core Show documentation
Show all versions of opensextant-xponents-core Show documentation
An information extraction toolkit focused on geography and temporal entities
/**
* NOTICE
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* **************************************************************************
* NOTICE
* This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2009-2013 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
*/
// ALL Patterns below - defines, rules, etc. -- are for MATCHING.
// Parsing of actual fields named in defines is done after matches are found.
// Validation of parsed fields is last.
# Well-known English month abbreviations: three letters each, with SEP also accepted as SEPT.
#DEFINE MON_ABBREV JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEPT?|OCT|NOV|DEC
# A month name starting with one of the 3-letter prefixes above, followed by a bounded number of
# additional letters (e.g. DECEMBER, DECIEMBRE). MAY takes no additional letters.
# Non-English month names are detected only by coincidence, when they share the same 3-letter prefix.
# Locales for date patterns and language options could be explored further, but that is beyond the scope of this file.
# NOTE(review): in java.util.regex, \p{Alpha} is ASCII-only unless UNICODE_CHARACTER_CLASS is set -- confirm how these patterns are compiled.
#DEFINE MON_NAME JAN\p{Alpha}{0,6}|FEB\p{Alpha}{0,6}|MAR\p{Alpha}{0,2}|APR\p{Alpha}{0,3}|MAY|JUN\p{Alpha}{0,2}|JUL\p{Alpha}{0,2}|AUG\p{Alpha}{0,3}|SEP\p{Alpha}{0,6}|OCT\p{Alpha}{0,6}|NOV\p{Alpha}{0,6}|DEC\p{Alpha}{0,6}
// Disabled alternative definition of MON_NAME (any 3 uppercase letters plus up to 8 word characters):
// #DEFINE MON_NAME [A-Z]{3}\w{0,8}
# Ordinal suffixes that can follow a day of month, as in 19th, 22nd, 23rd, 1st.
#DEFINE DAY_ENUM th|nd|rd|st
# Fixed-length fields
// In practice, a year is 1xxx or 2xxx. Years 0001 to 0999 are not considered.
#DEFINE YEAR [12]\d{3}
// Two-digit year, optionally preceded by an apostrophe, e.g. '76
#DEFINE YY '?\d\d
// YEARYY is either a 2-digit year (optional leading apostrophe, e.g. '76) or any 4 digits;
// a 3-digit year is not matched. This is only used for matching. XTemp still validates matches.
#DEFINE YEARYY '?\d{2}|\d{4}
// Two-digit, zero-padded month and day. The character classes are loose (e.g. MM admits 00 and 13-19);
// range checking is left to the validation of parsed fields.
#DEFINE MM [01]\d
#DEFINE DD [0-3]\d
// Single uppercase letter time zone designator, e.g. the Z in 301400Z.
#DEFINE SHORT_TZ [A-Z]
// Two-digit hour, minute and second.
#DEFINE hh [0-2]\d
#DEFINE mm [0-5]\d
#DEFINE ss [0-5]\d
# Variable-length fields
# Day of month and month, one or two digits (leading zero optional).
#DEFINE DOM [0-3]?\d
#DEFINE MONTH [01]?\d
# Time zone abbreviation of 3 to 5 uppercase letters.
#DEFINE LONG_TZ [A-Z]{3,5}
# The word "of" in its common capitalizations, as in "DEC of 1990".
#DEFINE OF of|Of|OF
# Date field separators: hyphen, slash or period.
# Do not use DSEP? (that is, an optional separator). Most field-separated patterns would be too noisy.
# NOTE(review): DSEP1 and DSEP2 are identical; presumably they are separate fields so that mixed
# separators (e.g. 12/30-2020) can be rejected during validation -- confirm against the parser.
#DEFINE DSEP1 [-/.]
#DEFINE DSEP2 [-/.]
// ........................................
// Month, Day, Year patterns, MDY
// ........................................
// FORM: DATE: MM/DD/YY
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines; confirm <MONTH>/<DOM> (rather than <MM>/<DD>) against the upstream file.
#RULE MDY 01 \b<MONTH>/<DOM>/<YY>\b
#TEST MDY 01 12/30/90
#TEST MDY 01 DATE: 12/30/90
#TEST MDY 01 13/30/90 FAIL bad MON
#TEST MDY 01 12/32/90 FAIL bad DOM
#TEST MDY 01 12/30/01
#TEST MDY 01 12/30/00
#TEST MDY 01 12/30/55
#TEST MDY 01 12/30/15
// FORM: DATE: MM/DD/YYYY
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines; confirm <MONTH>/<DOM> (rather than <MM>/<DD>) against the upstream file.
#RULE MDY 02 \b<MONTH>/<DOM>/<YEAR>\b
#TEST MDY 02 12/30/1990
#TEST MDY 02 DATE: 12/30/1990
#TEST MDY 02 13/30/1990 FAIL bad MON
#TEST MDY 02 12/32/1990 FAIL bad DOM
#TEST MDY 02 12/30/2001
#TEST MDY 02 12/30/0000 FAIL bad YYYY
#TEST MDY 02 12/30/1955
#TEST MDY 02 12/30/1915
// FORM: MMM DD, YYYY or MMM DD YYYY, MMM DD, YY, etc.
// Month name (abbreviated or spelled out), day of month, then a 2- or 4-digit year.
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines; confirm against the upstream file.
#RULE MDY 03 \b<MON_NAME>[\s.]+<DOM>[\s,.]+<YEARYY>\b
#TEST MDY 03 DEC 30, 1990
#TEST MDY 03 DEC 30 1990
#TEST MDY 03 DEC.30 1990
#TEST MDY 03 DEC. 30 1990
#TEST MDY 03 DEC.30.1990
#TEST MDY 03 DEC 30 90
#TEST MDY 03 DEC 30 990 FAIL bad year
#TEST MDY 03 DEC 00 1990 FAIL bad DOM
#TEST MDY 03 DEC 01 2300 FAIL ambiguous; time appears to be 2300 where year is expected
#TEST MDY 03 DECEMBER 30 1990
#TEST MDY 03 DECMEBER 30 90
#TEST MDY 03 DECIEMBRE 30 1990
#TEST MDY 03 DECIEMBRE 00 1990 # FAIL no 00 day
#TEST MDY 03 DECEMBER 01 2300 FAIL ambiguous; time appears to be 2300 where year is expected
// FORM: MMM, YYYY or Month, YYYY. The comma is optional; a 4-digit year is required.
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines; confirm against the upstream file.
#RULE MDY 04 \b<MON_NAME>[\s,.]+<YEAR>\b
#TEST MDY 04 DEC 1990
#TEST MDY 04 DEC, 1990
#TEST MDY 04 DEC. 1990
#TEST MDY 04 DECEMBER, 1990
#TEST MDY 04 DECIEMBRE, 1990
// FORM: MMM of YYYY
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines; confirm against the upstream file.
#RULE MDY 04a \b<MON_NAME>\s+<OF>\s+<YEAR>\b
#TEST MDY 04a DEC of 1990
#TEST MDY 04a DECEMBER of 1990
// FORM: DD MMM YYYY or DD MMM YY; the month may be abbreviated or spelled out.
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// #TEST lines; confirm against the upstream file.
#RULE MDY 05 \b<DOM>\s+<MON_NAME>[\s,]+<YEARYY>\b
#TEST MDY 05 30 DEC 1990
#TEST MDY 05 30 DEC 90
#TEST MDY 05 01 DEC 00
#TEST MDY 05 01 DEC 02
#TEST MDY 05 30 DECEMBER 1990
#TEST MDY 05 30 DECMEBER 1990
#TEST MDY 05 30 DECIEMBRE 1990
// FORM: Month DDth, YYYY -- day of month carries an ordinal suffix (st, nd, rd, th); 4-digit year required.
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// #TEST lines; confirm against the upstream file.
#RULE MDY 06a \b<MON_NAME>[\s.]+<DOM><DAY_ENUM>[\s,]+<YEAR>\b
#TEST MDY 06a September 19th, 2017
#TEST MDY 06a September 19th, 17 # FAIL
#TEST MDY 06a September 19 th, 17 # FAIL
#TEST MDY 06a Sept. 19th, 2017
#TEST MDY 06a Sept 19th, 2017
#TEST MDY 06a Sept 1st, 2017
#TEST MDY 06a Sept 23rd, 2017
#TEST MDY 06a Sept 15th, 2017
#TEST MDY 06a Sept 22nd, 2017
// FORM: DDth Month, YYYY or DDth of Month, YYYY -- ordinal day first, "of" optional.
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// #TEST lines; confirm against the upstream file.
#RULE MDY 06b \b<DOM><DAY_ENUM>\s+<OF>?\s*<MON_NAME>[\s,]+<YEAR>\b
#TEST MDY 06b 19th September, 2017
#TEST MDY 06b 19th of September, 2017
// FORM: DD MMM YYYY HHMM or DD MMM YY HH:MM -- day, month name, year, then a time with an optional colon.
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// #TEST lines; confirm against the upstream file.
#RULE DMYT 01 \b<DOM>\s+<MON_NAME>[\s,]+<YEARYY> <hh>:?<mm>\b
#TEST DMYT 01 30 DEC 1990 0400
#TEST DMYT 01 30 DEC 90 0400
#TEST DMYT 01 11 JUN 14 1815 06:15 PM, 11 JUNE 2014
#TEST DMYT 01 25 March 2012 04:00
#TEST DMYT 01 25 March, 2012 04:00
// FORM: DATE: DD-MON-YYYY or DD-MON-YY
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines; confirm <MON_ABBREV> (rather than <MON_NAME>) against the upstream file.
#RULE DMY 01 \b<DOM>-<MON_ABBREV>-<YEARYY>\b
#TEST DMY 01 12-DEC-90
#TEST DMY 01 12-DEC-1990
// FORM: DATE: DD MON YYYY, with optional whitespace between fields (e.g. 12DEC1990).
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines; confirm against the upstream file.
#RULE DMY 02 \b<DOM>\s*<MON_NAME>\s*<YEARYY>\b
#TEST DMY 02 12 DEC 90
#TEST DMY 02 12 DEC 1990
#TEST DMY 02 12DEC90
#TEST DMY 02 12DEC1990
#TEST DMY 02 12MARCH1999
#TEST DMY 02 12FEBBRAIO1999
#TEST DMY 02 12JUL1999
#TEST DMY 02 12JULIO1999
// FORM: DATE: YYYY-MM-DD as it appears in free text of documents.
// The limitations of this pattern are related to how it was used.
// This is a relatively modern format; was this format used in text in 1700s?
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines. No literal "-" remained in the pattern, so the separators are assumed to be
// the <DSEP1>/<DSEP2> fields; confirm against the upstream file.
#RULE YMD 01 \b<YEAR><DSEP1><MM><DSEP2><DD>\b
#TEST YMD 01 2001-11-11
#TEST YMD 01 0001-04-34 # FAIL
#TEST YMD 01 1001-04-30 # FAIL
#TEST YMD 01 2001-04-30
#TEST YMD 01 1990-04-30
#TEST YMD 01 1790-04-30 # FAIL -- 1800 01 01 is earliest date for this pattern.
#TEST YMD 01 a2001-04-30 # FAIL
#TEST YMD 01 c2001-04-30 # FAIL
#TEST YMD 01 42001-04-30 # FAIL
// ........................................
// DATE TIME PATTERNS, DTM
// ........................................
// FORM: A|O|P|R DDHHMMZ MMM YY
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines; confirm <MON_ABBREV> and <YY> (rather than <MON_NAME>/<YEARYY>) against the upstream file.
#RULE DTM 01 \b<DD><hh><mm><SHORT_TZ>\s*<MON_ABBREV>\s*<YY>\b
#TEST DTM 01 A 301400Z DEC 90
#TEST DTM 01 R 301400Z DEC 90
#TEST DTM 01 A 351400Z DEC 90 # FAIL day out of range
// FORM: YYYYMMDDTHHMMZ
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines; confirm against the upstream file.
#RULE DTM 02 \b<YEAR><MM><DD>T<hh><mm><SHORT_TZ>\b
#TEST DTM 02 20101230T1400Z
// FORM: YYYY-MM-DDTHH:MM:SS ... ISO Time. ISO 8601 uses "-", not "/", but that should be validated in normalization.
// DTM 04 is collapsed into this pattern.
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines. No literal "-" remained in the pattern, so the separators are assumed to be
// the <DSEP1>/<DSEP2> fields; confirm against the upstream file.
#RULE DTM 03 \b<YEAR><DSEP1><MM><DSEP2><DD>[T ]<hh>:<mm>:<ss>
#TEST DTM 03 2010-12-30T14:00:01:12
#TEST DTM 03 2010-12-30T14:00:02
// FORM: YYYY-MM-DDTHH:MM ... ISO Time without seconds. ISO 8601 uses "-", not "/"; separators are checked after matching.
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines. No literal "-" remained in the pattern, so the separators are assumed to be
// the <DSEP1>/<DSEP2> fields; confirm against the upstream file.
#RULE DTM 03b \b<YEAR><DSEP1><MM><DSEP2><DD>[T ]<hh>:<mm>
#TEST DTM 03b 2010-12-30T14:01:11:12
#TEST DTM 03b 2010-12-30T14:02:12
#TEST DTM 03b 2010-12/30T14:03:13 # FAIL
// FORM: MM/DD/YY* HH:MM:SS.
// TODO: 12-hour clock time and detect PM/AM. This HH:MM is only 24 hour clock.
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines; confirm <MONTH>/<DOM> (rather than <MM>/<DD>) against the upstream file.
#RULE DTM 05a \b<MONTH><DSEP1><DOM><DSEP2><YEARYY>\s?<hh>:<mm>:<ss>
#TEST DTM 05a 12-30-20 14:00:00:12 # extra data associated with value?
#TEST DTM 05a 12-30-2020 14:00:01
#TEST DTM 05a 12/30-2020 14:00:02 # FAIL Test mixed punctuation.
#TEST DTM 05a 12/30/2020 14:00:02 # Test mixed punctuation.
#TEST DTM 05a 12/30/20 14:00 # Test mixed punctuation.
// FORM: MM/DD/YY* HH:MM.
// TODO: 12-hour clock time and detect PM/AM. This HH:MM is only 24 hour clock.
// NOTE(review): the <FIELD> references in this rule were missing and have been restored from the
// FORM and #TEST lines; confirm <MONTH>/<DOM> (rather than <MM>/<DD>) against the upstream file.
#RULE DTM 05b \b<MONTH><DSEP1><DOM><DSEP2><YEARYY>\s?<hh>:<mm>
#TEST DTM 05b 12-30-20 14:00:00:12 # extra data associated with value?
#TEST DTM 05b 12-30-2020 14:01:01
#TEST DTM 05b 12/30-2020 14:02:02 # FAIL Test mixed punctuation.
#TEST DTM 05b 12/30/2020 14:03:02 # Test mixed punctuation.
#TEST DTM 05b 12/30/20 14:04 # Test mixed punctuation.
#TEST DTM 05b 12.30.20 14:04 # European convention for Date.
#TEST DTM 05b 2.30.20 14:04 # FAIL No 30 FEB. European convention for Date. Non-zero padded
#TEST DTM 05b 2.30/2020 14:04 # FAIL No 30 FEB., Mixed Separators. European convention for Date. Non-zero padded.
#TEST DTM 05b 4.30.20 14:04 # European convention for Date. Non-zero padded
#TEST DTM 05b 4.30/2020 14:04 # FAIL Mixed Separators. European convention for Date. Non-zero padded.
// An 8-digit date works well in short spans of text, but as a general pattern it matches any 8-digit number.
// It is not very accurate on data or large texts, so the rule is left disabled.
// FORM: YYYYMMDD
//#RULE DTM xx \b<YEAR><MM><DD>\b
//#TEST DTM xx 20101230
© 2015 - 2024 Weber Informatics LLC | Privacy Policy