org.apache.solr.update.processor.ParseDateFieldUpdateProcessorFactory Maven / Gradle / Ivy
Show all versions of solr-core Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update.processor;
import java.lang.invoke.MethodHandles;
import java.text.ParsePosition;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.time.format.DateTimeParseException;
import java.time.format.ResolverStyle;
import java.time.temporal.TemporalAccessor;
import java.time.temporal.TemporalQueries;
import java.util.Collection;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.schema.DateValueFieldType;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.util.LocaleUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Attempts to mutate selected fields that have only CharSequence-typed values into Date values.
* Solr will continue to index date/times in the UTC time zone, but the input date/times may be
* expressed using other time zones, and will be converted to an unambiguous {@link Date} when they
* are mutated.
*
* The default selection behavior is to mutate both those fields that don't match a schema field,
* as well as those fields that match a schema field with a date field type.
*
*
If all values are parseable as dates (or are already Date), then the field will be mutated,
* replacing each value with its parsed Date equivalent; otherwise, no mutation will occur.
*
*
One or more date "format" specifiers must be specified. See Java
* 8's DateTimeFormatter javadocs for a description of format strings. Note that "lenient" and
* case insensitivity is enabled. Furthermore, inputs surrounded in single quotes will be removed if
* found.
*
*
A default time zone name or offset may optionally be specified for those dates that don't
* include an explicit zone/offset. NOTE: three-letter zone designations like "EST" are not
* parseable (with the single exception of "UTC"), because they are ambiguous. If no default time
* zone is specified, UTC will be used. See Wikipedia's list of TZ
* database time zone names.
*
*
The locale to use when parsing field values using the specified formats may optionally be
* specified. If no locale is configured, then {@code en_US} will be used since it's implied by some
* well-known formats. Recent versions of Java have become sensitive to this. The following
* configuration specifies the French/France locale and two date formats that will parse the strings
* "le mardi 8 janvier 2013" and "le 28 déc. 2010 à 15 h 30", respectively. Note that either
* individual <str> elements or <arr>-s of <str> elements may be used to specify
* the date format(s):
*
*
* <processor class="solr.ParseDateFieldUpdateProcessorFactory">
* <str name="defaultTimeZone">Europe/Paris</str>
* <str name="locale">fr_FR</str>
* <arr name="format">
* <str>'le' EEEE dd MMMM yyyy</str>
* <str>'le' dd MMM. yyyy 'à' HH 'h' mm</str>
* </arr>
* </processor>
*
* See {@link Locale} for a description of acceptable language, country (optional) and variant
* (optional) values, joined with underscore(s).
*
*
Tip: you can use multiple instances of this URP in your chain with different locales or
* default time zones if you wish to vary those settings for different format patterns.
*
* @since 4.4.0
*/
public class ParseDateFieldUpdateProcessorFactory extends FieldMutatingUpdateProcessorFactory {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private static final String FORMATS_PARAM = "format";
private static final String DEFAULT_TIME_ZONE_PARAM = "defaultTimeZone";
private static final String LOCALE_PARAM = "locale";
private Map formats = new LinkedHashMap<>();
@Override
public UpdateRequestProcessor getInstance(
SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor next) {
return new AllValuesOrNoneFieldMutatingUpdateProcessor(getSelector(), next) {
final ParsePosition parsePosition = new ParsePosition(0);
@Override
protected Object mutateValue(Object srcVal) {
Object parsed = parsePossibleDate(srcVal, formats.values(), parsePosition);
return parsed != null ? parsed : SKIP_FIELD_VALUE_LIST_SINGLETON;
}
};
}
@Override
public void init(NamedList args) {
Locale locale;
String localeParam = (String) args.remove(LOCALE_PARAM);
if (null != localeParam) {
locale = LocaleUtils.toLocale(localeParam);
} else {
locale = Locale.US; // because well-known patterns assume this
}
Object defaultTimeZoneParam = args.remove(DEFAULT_TIME_ZONE_PARAM);
ZoneId defaultTimeZone = ZoneOffset.UTC;
if (null != defaultTimeZoneParam) {
defaultTimeZone = ZoneId.of(defaultTimeZoneParam.toString());
}
Collection formatsParam = args.removeConfigArgs(FORMATS_PARAM);
if (null != formatsParam) {
for (String value : formatsParam) {
DateTimeFormatter formatter =
new DateTimeFormatterBuilder()
.parseLenient()
.parseCaseInsensitive()
.appendPattern(value)
.toFormatter(locale)
.withResolverStyle(ResolverStyle.LENIENT)
.withZone(defaultTimeZone);
validateFormatter(formatter);
formats.put(value, formatter);
}
}
super.init(args);
}
/**
* Returns true if the field doesn't match any schema field or dynamic field, or if the matched
* field's type is BoolField
*/
@Override
public FieldMutatingUpdateProcessor.FieldNameSelector getDefaultSelector(final SolrCore core) {
return fieldName -> {
final IndexSchema schema = core.getLatestSchema();
FieldType type = schema.getFieldTypeNoEx(fieldName);
return (null == type) || type instanceof DateValueFieldType;
};
}
public static Object parsePossibleDate(
Object srcVal, Collection parsers, ParsePosition parsePosition) {
if (srcVal instanceof CharSequence) {
String srcStringVal = srcVal.toString();
// trim single quotes around date if present
// see issue #5279 (Apache HttpClient)
int stringValLen = srcStringVal.length();
if (stringValLen > 1 && srcStringVal.startsWith("'") && srcStringVal.endsWith("'")) {
srcStringVal = srcStringVal.substring(1, stringValLen - 1);
}
for (DateTimeFormatter parser : parsers) {
try {
return Date.from(parseInstant(parser, srcStringVal, parsePosition));
} catch (DateTimeParseException e) {
if (log.isDebugEnabled()) {
log.debug("value '{}' is not parseable with format '{}'", srcStringVal, parser);
}
}
}
log.debug(
"value '{}' was not parsed by any configured format, thus was not mutated", srcStringVal);
return null;
}
if (srcVal instanceof Date) {
return srcVal;
}
return null;
}
public static void validateFormatter(DateTimeFormatter formatter) {
// check it's valid via round-trip
try {
parseInstant(formatter, formatter.format(Instant.now()), new ParsePosition(0));
} catch (Exception e) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
"Bad or unsupported pattern: " + formatter.toFormat().toString(),
e);
}
}
// see https://bugs.java.com/bugdatabase/view_bug.do?bug_id=8177021 which is fixed in Java 9.
// The upshot is that trying to use parse(Instant::from) is unreliable in the event that
// the input string contains a timezone/offset that differs from the "override zone"
// (which we configure in DEFAULT_TIME_ZONE). Besides, we need the code below which handles
// the optionality of time. Were it not for that, we truly could do
// formatter.parse(Instant::from).
private static Instant parseInstant(
DateTimeFormatter formatter, String dateStr, ParsePosition parsePosition) {
// prepare for reuse
parsePosition.setIndex(0);
parsePosition.setErrorIndex(-1);
final TemporalAccessor temporal = formatter.parse(dateStr, parsePosition);
// check that all content has been parsed
if (parsePosition.getIndex() < dateStr.length()) {
final String abbr;
if (dateStr.length() > 64) {
abbr = dateStr.subSequence(0, 64).toString() + "...";
} else {
abbr = dateStr;
}
throw new DateTimeParseException(
"Text '"
+ abbr
+ "' could not be parsed, unparsed text found at index "
+ parsePosition.getIndex(),
dateStr,
parsePosition.getIndex());
}
// Get Date; mandatory
LocalDate date = temporal.query(TemporalQueries.localDate()); // mandatory
if (date == null) {
throw new SolrException(
SolrException.ErrorCode.SERVER_ERROR,
"Date (year, month, day) is mandatory: " + formatter.toFormat().toString());
}
// Get Time; optional
LocalTime time = temporal.query(TemporalQueries.localTime());
if (time == null) {
time = LocalTime.MIN;
}
final LocalDateTime localDateTime = LocalDateTime.of(date, time);
// Get Zone Offset; optional
ZoneOffset offset = temporal.query(TemporalQueries.offset());
if (offset == null) {
// no Zone offset; get Zone ID
ZoneId zoneId = temporal.query(TemporalQueries.zone());
if (zoneId == null) {
zoneId = formatter.getZone();
if (zoneId == null) {
zoneId = ZoneOffset.UTC;
}
}
return localDateTime.atZone(zoneId).toInstant();
} else {
return localDateTime.toInstant(offset);
}
}
}