All downloads are free. Search and download functionality uses the official Maven repository.

ai.platon.pulsar.common.DateTimeDetector Maven / Gradle / Ivy

There is a newer version: 2.1.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
* Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package ai.platon.pulsar.common;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateUtils;

import java.text.ParseException;
import java.time.*;
import java.util.Date;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;

/**
 * Detects a date or date-time embedded in a piece of text (a title, a meta value, a URL, ...).
 *
 * <p>Detection is heuristic: the text is searched for a "work year" (see {@link #VALID_WORK_YEARS}),
 * the characters following it are cut out and then parsed strictly against the configured formats.
 * All shape checks assume a four digit year of the form {@code 20xx}.
 *
 * <p>Every {@code detect*} / {@code tryParse*} method returns {@code null} when nothing usable is found.
 *
 * TODO: see org.apache.commons.lang3.time.DateUtils
 * TODO: too many bugs
 */
public class DateTimeDetector {

    public static final int MIN_DATE_TIME_STR_LENGTH = "2015-01-01 12:00".length();
    public static final int MIN_YEAR_MONTH_STR_LENGTH = "201501".length();
    public static final int MIN_DATE_STR_LENGTH = "20150101".length();
    public static final int MAX_META_STR_LENGTH = 200;
    public static final int MAX_DATE_TIME_STR_LENGTH = "EEE, dd MMM yyyy HH:mm:ss zzz".length();
    public static final int MAX_TITLE_LENGTH = 350;

    /** A text containing any of these fragments is rejected by {@link #detectDateTime(String)}. */
    public static final String[] BAD_DATE_TIME_STRING_CONTAINS = new String[]{
            "GMT+8", "UTC+8", "Processed", "访问", "刷新", "visit"
    };

    public static final int OLD_DATE_DAYS = 30;

    /** The date at class-load time; it is NOT refreshed while the JVM keeps running. */
    public static final LocalDate CURRENT_DATE = LocalDate.now();
    public static final long CURRENT_DATE_EPOCH_DAYS = CURRENT_DATE.toEpochDay();
    public static final int CURRENT_YEAR = CURRENT_DATE.getYear();
    public static final String CURRENT_YEAR_STR = String.valueOf(CURRENT_YEAR);
    public static final int CURRENT_MONTH = CURRENT_DATE.getMonthValue();
    public static final int YEAR_LOWER_BOUND = 1990;

    /** First year (inclusive) that is recognised as the start of a date. */
    private static final int WORK_YEAR_START = 2010;
    /**
     * End (exclusive) of the recognised years. It is 2030 for as long as the current year is
     * before 2030 and afterwards grows with the calendar, so the current year stays detectable.
     */
    private static final int WORK_YEAR_END = Math.max(2030, CURRENT_YEAR + 1);

    /** Four digit years ("2010", "2011", ...) that may start a date. */
    public static final List<String> VALID_WORK_YEARS = IntStream.range(WORK_YEAR_START, WORK_YEAR_END)
            .mapToObj(String::valueOf).collect(Collectors.toList());
    /** Two digit forms ("10", "11", ...) of {@link #VALID_WORK_YEARS}. */
    public static final List<String> VALID_WORK_YEARS_SHORT = IntStream.range(WORK_YEAR_START, WORK_YEAR_END)
            .mapToObj(year -> padToTwoDigits(String.valueOf(year % 100))).collect(Collectors.toList());
    public static final String[] VALID_WORK_YEARS_ARRAY = VALID_WORK_YEARS.toArray(new String[0]);
    public static final String[] VALID_WORK_YEARS_SHORT_ARRAY = VALID_WORK_YEARS_SHORT.toArray(new String[0]);

    /** Years from {@link #YEAR_LOWER_BOUND} up to, but not including, the current year. */
    public static Set<String> OLD_YEARS;
    /** Two digit months of the current year, from "01" up to the month two before the current one. */
    public static Set<String> OLD_MONTH;
    /** Matches a text containing the current year followed by one of the {@link #OLD_MONTH} values. */
    public static Pattern OLD_MONTH_URL_DATE_PATTERN;

    // 2016-03-05 20:07:51
    // In these patterns "HH" is the hour of the day (0-23), "hh" is the clock hour (1-12).
    public static String[] COMMON_DATE_FORMATS = new String[]{
            "yyyyMMdd",
            "yyyy.MM.dd",
            "yyyy-MM-dd",
            "yyyy年MM月dd日",
            "yyyy/MM/dd",
    };

    public static String[] COMMON_DATE_TIME_FORMATS = new String[]{
            "yyyy-MM-dd'T'HH:mm:ss",
            // listed twice in the original source; kept so that the array layout does not change
            "yyyy-MM-dd'T'HH:mm:ss",
            "yyyy.MM.dd HH:mm:ss",
            "yyyy-MM-dd HH:mm:ss",
            "yyyy-MM-dd hh:mm:ss",
            "yyyy-MM-dd HH:mm",
            "yyyy-MM-dd hh:mm",
            "yyyy-MM-dd'T'HH:mm:ss'Z'",

            "yyyy年MM月dd日 HH:mm",
            "yyyy年MM月dd日 hh:mm",
            "yyyy年MM月dd日 HH:mm:ss",
            "yyyy年MM月dd日 hh:mm:ss",

            "yyyy/MM/dd HH:mm",
            "yyyy/MM/dd hh:mm",
            "yyyy/MM/dd HH:mm:ss",
            "yyyy/MM/dd hh:mm:ss",
            "yyyy/MM/dd HH:mm:ss.SSS zzz",
            "yyyy/MM/dd HH:mm:ss.SSS",
            "yyyy/MM/dd HH:mm:ss zzz",

            "MMM dd yyyy HH:mm:ss. zzz",
            "MMM dd yyyy HH:mm:ss zzz",
            "dd.MM.yyyy HH:mm:ss zzz",
            "dd MM yyyy HH:mm:ss zzz",
            "dd.MM.yyyy zzz",
            "dd.MM.yyyy; HH:mm:ss",
            "dd.MM.yyyy HH:mm:ss",

            "EEE MMM dd HH:mm:ss yyyy",
            "EEE MMM dd HH:mm:ss yyyy zzz",
            "EEE MMM dd HH:mm:ss zzz yyyy",
            "EEE, dd MMM yyyy HH:mm:ss zzz",
            "EEE,dd MMM yyyy HH:mm:ss zzz",
            "EEE, dd MMM yyyy HH:mm:sszzz",
            "EEE, dd MMM yyyy HH:mm:ss",
            "EEE, dd-MMM-yy HH:mm:ss zzz"
    };

    // Shape checks, compiled once. A trailing ".*" lets an exact match such as "20150101" pass as well.
    private static final Pattern SPACE_SEPARATOR_PATTERN = Pattern.compile("\\p{Zs}");
    private static final Pattern DATE_SEPARATOR_PATTERN = Pattern.compile("[\\./年月]");
    private static final Pattern SHORT_YEAR_DATE_PATTERN = Pattern.compile("20[0-9][0-9][\\-\\./年]?\\d+.+");
    private static final Pattern COMPACT_YEAR_MONTH_PATTERN = Pattern.compile("20[0-9][0-9][0-1][0-9].*");
    private static final Pattern DASHED_YEAR_MONTH_PATTERN = Pattern.compile("20[0-9][0-9]-[0-1]?[0-9].*");
    private static final Pattern COMPACT_DATE_PATTERN = Pattern.compile("20[0-9][0-9][0-1][0-9][0-3][0-9].*");
    private static final Pattern DASHED_DATE_PATTERN = Pattern.compile("20[0-9][0-9]-[0-1]?[0-9]-[0-3]?[0-9].*");

    static {
        // 1 year ago and earlier
        OLD_YEARS = IntStream.range(YEAR_LOWER_BOUND, CURRENT_YEAR)
                .mapToObj(String::valueOf).collect(Collectors.toSet());
        // 2 months ago and earlier; the range is empty when the current month is January or February
        OLD_MONTH = IntStream.range(1, CURRENT_MONTH - 1)
                .mapToObj(m -> String.format("%02d", m)).collect(Collectors.toSet());
        String monthPattern = StringUtils.join(OLD_MONTH, "|");
        if (CURRENT_MONTH <= 2) {
            // No "old" month exists yet this year: fall back to any two digits
            monthPattern = "\\d{2}";
        }
        // eg : ".+2016[/\.-]?(01|02|03|04|05|06|07|08|09).+"
        OLD_MONTH_URL_DATE_PATTERN = Pattern.compile(".+" + CURRENT_YEAR + "[/\\.-]?(" + monthPattern + ").+");
    }

    private final String[] dateFormats;
    private final String[] dateTimeFormats;
    private ZoneId zoneId;

    /** Creates a detector using the common formats and the system default zone. */
    public DateTimeDetector() {
        this(COMMON_DATE_FORMATS, COMMON_DATE_TIME_FORMATS);
    }

    /** Creates a detector using the common formats and the given zone. */
    public DateTimeDetector(ZoneId zoneId) {
        this(COMMON_DATE_FORMATS, COMMON_DATE_TIME_FORMATS, zoneId);
    }

    /** Creates a detector using the given formats and the system default zone. */
    public DateTimeDetector(String[] dateFormats, String[] dateTimeFormats) {
        this(dateFormats, dateTimeFormats, ZoneId.systemDefault());
    }

    /**
     * Creates a detector.
     *
     * @param dateFormats     patterns used to parse a date without a time part
     * @param dateTimeFormats patterns used to parse a date with a time part
     * @param zoneId          zone of the returned {@link OffsetDateTime} values
     */
    public DateTimeDetector(String[] dateFormats, String[] dateTimeFormats, ZoneId zoneId) {
        this.dateFormats = dateFormats;
        this.dateTimeFormats = dateTimeFormats;
        this.zoneId = zoneId;
        // The recognised year window follows the calendar (see WORK_YEAR_END), so the detector
        // no longer needs to terminate the JVM once the year 2030 has passed.
    }

    public ZoneId getZoneId() {
        return zoneId;
    }

    public void setZoneId(ZoneId zoneId) {
        this.zoneId = zoneId;
    }

    /**
     * Detects a possible date time string in a text.
     *
     * <p>Only the first line of the text is inspected. The candidate starts at the first four digit
     * work year or, failing that, at the first two digit work year (which is expanded with the
     * prefix "20"). It is cut to at most {@link #MAX_DATE_TIME_STR_LENGTH} characters and the
     * separators {@code . / 年 月} are replaced by {@code -}.
     *
     * @param text the text to inspect, may be null
     * @return the trimmed candidate, or null if the text contains no candidate
     */
    public String detectPossibleDateTimeString(String text) {
        String firstLine = StringUtils.substringBefore(text, "\n");

        int dateStart = StringUtils.indexOfAny(firstLine, VALID_WORK_YEARS_ARRAY);
        if (dateStart == StringUtils.INDEX_NOT_FOUND) {
            int shortYearStart = StringUtils.indexOfAny(firstLine, VALID_WORK_YEARS_SHORT_ARRAY);
            if (shortYearStart == StringUtils.INDEX_NOT_FOUND) {
                return null;
            }

            // Expand "15-01-07..." to "2015-01-07..." and make sure it still looks like a date.
            firstLine = "20" + firstLine.substring(shortYearStart);
            if (!SHORT_YEAR_DATE_PATTERN.matcher(firstLine).matches()) {
                // Two arbitrary digits, not a year: there is nothing to return.
                return null;
            }
            dateStart = 0;
        }

        int dateEnd = Math.min(firstLine.length(), dateStart + MAX_DATE_TIME_STR_LENGTH);
        String candidate = firstLine.substring(dateStart, dateEnd);
        return DATE_SEPARATOR_PATTERN.matcher(candidate).replaceAll("-").trim();
    }

    /**
     * Tries to detect the best date time: a full date time first, a plain date as the fallback.
     *
     * @param text the text to inspect, may be null
     * @return the detected date time, or null
     */
    public OffsetDateTime detectDateTimeLeniently(String text) {
        OffsetDateTime dateTime = detectDateTime(text);
        return dateTime != null ? dateTime : detectDate(text);
    }

    /**
     * Detects a date followed by a time, for example "2017-12-20 11:18:35".
     *
     * <p>The candidate starts at the first four digit work year, runs to the first space after it
     * and then continues over the following digits and colons.
     *
     * @param text the text to inspect, may be null
     * @return the parsed date time, or null if no date time was found, the text is shorter than
     * {@link #MIN_DATE_TIME_STR_LENGTH} or it contains one of {@link #BAD_DATE_TIME_STRING_CONTAINS}
     */
    public OffsetDateTime detectDateTime(String text) {
        if (text == null || text.length() < MIN_DATE_TIME_STR_LENGTH) {
            return null;
        }

        // \p{Zs}: any kind of space separator, e.g. a no-break space
        final String normalized = SPACE_SEPARATOR_PATTERN.matcher(text).replaceAll(" ").trim();

        // May be an automatically generated date time
        if (Stream.of(BAD_DATE_TIME_STRING_CONTAINS).anyMatch(normalized::contains)) {
            return null;
        }

        final int dateTimeStart = StringUtils.indexOfAny(normalized, VALID_WORK_YEARS_ARRAY);
        if (dateTimeStart == StringUtils.INDEX_NOT_FOUND) {
            return null;
        }

        final int dateEnd = normalized.indexOf(' ', dateTimeStart);
        if (dateEnd < 0) {
            // No space after the date, so there is no time part. None of the date time formats can match.
            return null;
        }

        // Find the end of the time part
        int dateTimeEnd = dateEnd + 1;
        while (dateTimeEnd < normalized.length() && isTimeChar(normalized.charAt(dateTimeEnd))) {
            ++dateTimeEnd;
        }

        String possibleDate = normalized.substring(dateTimeStart, dateTimeEnd);
        return possibleDate.length() >= MIN_DATE_TIME_STR_LENGTH ? parseDateTimeStrictly(possibleDate) : null;
    }

    /**
     * Detects a year and month in a text.
     *
     * @param text the text to inspect, may be null
     * @return the detected year and month, or null
     */
    public YearMonth detectYearMonth(String text) {
        String possibleYearMonth = detectPossibleDateTimeString(text);
        return possibleYearMonth != null ? tryParseYearMonthStrictly(possibleYearMonth) : null;
    }

    /**
     * Parses the beginning of a string as a year and month.
     *
     * <p>Accepted shapes are "yyyyMM..." and "yyyy-M..." / "yyyy-MM...", where the year is 20xx.
     *
     * @param possibleYearMonth the string to parse, may be null
     * @return the year and month, or null if the string does not start with a valid year and month
     */
    public YearMonth tryParseYearMonthStrictly(String possibleYearMonth) {
        if (possibleYearMonth == null) {
            return null;
        }

        String isoYearMonth = null;
        if (COMPACT_YEAR_MONTH_PATTERN.matcher(possibleYearMonth).matches()) {
            isoYearMonth = possibleYearMonth.substring(0, 4) + "-" + possibleYearMonth.substring(4, 6);
        } else if (DASHED_YEAR_MONTH_PATTERN.matcher(possibleYearMonth).matches()) {
            // The pattern guarantees "yyyy-" followed by at least one digit
            String month = leadingDigits(possibleYearMonth.substring(5));
            // A longer digit run, e.g. "2015-123456", is an identifier rather than a month
            if (month.length() <= 2) {
                isoYearMonth = possibleYearMonth.substring(0, 4) + "-" + padToTwoDigits(month);
            }
        }

        if (isoYearMonth == null) {
            return null;
        }

        try {
            return YearMonth.parse(isoYearMonth);
        } catch (DateTimeException ignored) {
            // Right shape but not a real month, e.g. "2015-19"
            return null;
        }
    }

    /**
     * Detects a date without a time part.
     *
     * @param text the text to inspect, may be null
     * @return the detected date, or null
     */
    public OffsetDateTime detectDate(String text) {
        String possibleDate = detectPossibleDateTimeString(text);
        return possibleDate != null ? tryParseDateTimeStrictly(possibleDate) : null;
    }

    /**
     * Parses the beginning of a string as a date, using the date formats of this detector.
     *
     * <p>Accepted shapes are "yyyyMMdd..." and "yyyy-M-d..." with one or two digits for month and
     * day, where the year is 20xx. Anything after the day is ignored.
     *
     * @param possibleDate the string to parse, may be null
     * @return the date, or null if the string does not start with a valid date
     */
    public OffsetDateTime tryParseDateTimeStrictly(String possibleDate) {
        if (possibleDate == null) {
            return null;
        }

        String normalized;
        if (COMPACT_DATE_PATTERN.matcher(possibleDate).matches()) {
            normalized = possibleDate.substring(0, "yyyyMMdd".length());
        } else if (DASHED_DATE_PATTERN.matcher(possibleDate).matches()) {
            // The pattern guarantees: 4 digits, '-', 1-2 digits, '-', then at least one digit
            String[] parts = possibleDate.split("-", 3);
            String day = leadingDigits(parts[2]);
            if (day.length() > 2) {
                day = day.substring(0, 2);
            }
            normalized = parts[0] + "-" + padToTwoDigits(parts[1]) + "-" + padToTwoDigits(day);
        } else {
            return null;
        }

        try {
            return parseDateStrictly(normalized, dateFormats);
        } catch (RuntimeException ignored) {
            // Best effort: a misconfigured detector (e.g. a null zone) must not break detection
            return null;
        }
    }

    /**
     * Parses a string with the date formats of this detector.
     *
     * @param dateStr the string to parse, may be null
     * @return the parsed date, or null if no format matches
     */
    public OffsetDateTime parseDateStrictly(String dateStr) {
        return parseDateStrictly(dateStr, dateFormats);
    }

    /**
     * Parses a string with the date time formats of this detector.
     *
     * @param dateStr the string to parse, may be null
     * @return the parsed date time, or null if no format matches
     */
    public OffsetDateTime parseDateTimeStrictly(String dateStr) {
        return parseDateStrictly(dateStr, dateTimeFormats);
    }

    /**
     * Parses a string strictly with the given formats and expresses the result in the zone of this detector.
     *
     * @param dateStr the string to parse, may be null
     * @param formats the patterns to try
     * @return the parsed date time, or null if the string is null or no format matches
     */
    public OffsetDateTime parseDateStrictly(String dateStr, String... formats) {
        if (dateStr == null || formats == null) {
            return null;
        }

        try {
            // NOTE(review): DateUtils interprets the text in the JVM default time zone when the
            // pattern has no zone; only the resulting instant is converted to zoneId - confirm intended.
            Date parsedDate = DateUtils.parseDateStrictly(dateStr, formats);
            return OffsetDateTime.ofInstant(parsedDate.toInstant(), zoneId);
        } catch (ParseException ignored) {
            // No format matches: report "not a date" like the other methods of this class
            return null;
        }
    }

    /**
     * Parses a string with the date time formats of this detector.
     *
     * @param dateStr      the string to parse, may be null
     * @param defaultValue the value returned when the string can not be parsed
     * @return the parsed instant, or {@code defaultValue}
     */
    public Instant parseDateTimeStrictly(String dateStr, Instant defaultValue) {
        try {
            return DateUtils.parseDateStrictly(dateStr, dateTimeFormats).toInstant();
        } catch (ParseException | RuntimeException ignored) {
            // Unparseable text or null arguments: fall back to the caller supplied default
            return defaultValue;
        }
    }

    /**
     * For urls who contains date information, for example
     * http://bond.hexun.com/2011-01-07/126641872.html
     *
     * <p>The year and month are checked first, using the start of the last day of that month;
     * if that is not old enough the full date is checked. What counts as "old" is decided by
     * {@code DateTimes.isDaysBefore} (presumably: more than {@code days} days before now).
     *
     * @param text   a {@link java.lang.String} object.
     * @param days   a int.
     * @param zoneId a {@link java.time.ZoneId} object, used for the year-month check only.
     * @return true if the text contains a date that is considered old, false if the text is null,
     * contains no year and month, or the date is not old.
     */
    public boolean containsOldDate(String text, int days, ZoneId zoneId) {
        if (text == null) {
            return false;
        }

        YearMonth yearMonth = detectYearMonth(text);
        if (yearMonth == null) {
            return false;
        }

        OffsetDateTime monthEnd = yearMonth.atEndOfMonth().atTime(0, 0).atZone(zoneId).toOffsetDateTime();
        if (DateTimes.isDaysBefore(monthEnd, days)) {
            return true;
        }

        OffsetDateTime date = detectDate(text);
        return date != null && DateTimes.isDaysBefore(date, days);
    }

    /** Tells whether a character may be part of a time such as "11:18:35". */
    private static boolean isTimeChar(char ch) {
        return Character.isDigit(ch) || ch == ':';
    }

    /** Returns the run of ASCII digits at the start of a string, possibly empty. */
    private static String leadingDigits(String s) {
        int end = 0;
        while (end < s.length() && s.charAt(end) >= '0' && s.charAt(end) <= '9') {
            ++end;
        }
        return s.substring(0, end);
    }

    /** Prefixes a single character string with "0"; any other string is returned unchanged. */
    private static String padToTwoDigits(String digits) {
        return digits.length() == 1 ? "0" + digits : digits;
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy