All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.elasticsearch.xpack.core.ml.job.config.DataDescription Maven / Gradle / Ivy

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0; you may not use this file except in compliance with the Elastic License
 * 2.0.
 */
package org.elasticsearch.xpack.core.ml.job.config;

import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.xcontent.ObjectParser;
import org.elasticsearch.common.xcontent.ToXContentObject;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.xpack.core.ml.job.messages.Messages;
import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
import org.elasticsearch.xpack.core.ml.utils.time.DateTimeFormatterTimestampConverter;

import java.io.IOException;
import java.time.ZoneOffset;
import java.util.Locale;
import java.util.Objects;

/**
 * Describes the format of the data used in the job and how it should
 * be interpreted by autodetect.
 * 

* Data must either be in a textual delineated format (e.g. csv, tsv) or JSON * the {@linkplain DataFormat} enum indicates which. {@link #getTimeField()} * is the name of the field containing the timestamp and {@link #getTimeFormat()} * is the format code for the date string in as described by * {@link java.time.format.DateTimeFormatter}. The default quote character for * delineated formats is {@value #DEFAULT_QUOTE_CHAR} but any other character can be * used. */ public class DataDescription implements ToXContentObject, Writeable { /** * Enum of the acceptable data formats. */ public enum DataFormat implements Writeable { XCONTENT, DELIMITED; /** * Delimited used to be called delineated. We keep supporting that for backwards * compatibility. */ private static final String DEPRECATED_DELINEATED = "DELINEATED"; /** * Case-insensitive from string method. * Works with either JSON, json, etc. * * @param value String representation * @return The data format */ public static DataFormat forString(String value) { String valueUpperCase = value.toUpperCase(Locale.ROOT); return DEPRECATED_DELINEATED.equals(valueUpperCase) ? DELIMITED : DataFormat .valueOf(valueUpperCase); } public static DataFormat readFromStream(StreamInput in) throws IOException { return in.readEnum(DataFormat.class); } @Override public void writeTo(StreamOutput out) throws IOException { out.writeEnum(this); } @Override public String toString() { return name().toLowerCase(Locale.ROOT); } } public static final ParseField DATA_DESCRIPTION_FIELD = new ParseField("data_description"); public static final ParseField FORMAT_FIELD = new ParseField("format"); public static final ParseField TIME_FIELD_NAME_FIELD = new ParseField("time_field"); public static final ParseField TIME_FORMAT_FIELD = new ParseField("time_format"); public static final ParseField FIELD_DELIMITER_FIELD = new ParseField("field_delimiter"); public static final ParseField QUOTE_CHARACTER_FIELD = new ParseField("quote_character"); /** * Special time format string for epoch times (seconds) */ public static final String EPOCH = "epoch"; /** * Special time format string for epoch times (milli-seconds) */ public static final String EPOCH_MS = "epoch_ms"; /** * By default autodetect expects the timestamp in a field with this name */ public static final String DEFAULT_TIME_FIELD = "time"; /** * The default field delimiter expected by the native autodetect * program. */ public static final char DEFAULT_DELIMITER = '\t'; /** * Csv data must have this line ending */ public static final char LINE_ENDING = '\n'; /** * The default quote character used to escape text in * delineated data formats */ public static final char DEFAULT_QUOTE_CHAR = '"'; private final DataFormat dataFormat; private final String timeFieldName; private final String timeFormat; private final Character fieldDelimiter; private final Character quoteCharacter; // These parsers follow the pattern that metadata is parsed leniently (to allow for enhancements), whilst config is parsed strictly public static final ObjectParser LENIENT_PARSER = createParser(true); public static final ObjectParser STRICT_PARSER = createParser(false); private static ObjectParser createParser(boolean ignoreUnknownFields) { ObjectParser parser = new ObjectParser<>(DATA_DESCRIPTION_FIELD.getPreferredName(), ignoreUnknownFields, Builder::new); parser.declareString(Builder::setFormat, FORMAT_FIELD); parser.declareString(Builder::setTimeField, TIME_FIELD_NAME_FIELD); parser.declareString(Builder::setTimeFormat, TIME_FORMAT_FIELD); parser.declareString(Builder::setFieldDelimiter, DataDescription::extractChar, FIELD_DELIMITER_FIELD); parser.declareString(Builder::setQuoteCharacter, DataDescription::extractChar, QUOTE_CHARACTER_FIELD); return parser; } public DataDescription(DataFormat dataFormat, String timeFieldName, String timeFormat, Character fieldDelimiter, Character quoteCharacter) { this.dataFormat = dataFormat; this.timeFieldName = timeFieldName; this.timeFormat = timeFormat; this.fieldDelimiter = fieldDelimiter; this.quoteCharacter = quoteCharacter; } public DataDescription(StreamInput in) throws IOException { dataFormat = DataFormat.readFromStream(in); timeFieldName = in.readString(); timeFormat = in.readString(); fieldDelimiter = in.readBoolean() ? (char) in.read() : null; quoteCharacter = in.readBoolean() ? (char) in.read() : null; } @Override public void writeTo(StreamOutput out) throws IOException { dataFormat.writeTo(out); out.writeString(timeFieldName); out.writeString(timeFormat); if (fieldDelimiter != null) { out.writeBoolean(true); out.write(fieldDelimiter); } else { out.writeBoolean(false); } if (quoteCharacter != null) { out.writeBoolean(true); out.write(quoteCharacter); } else { out.writeBoolean(false); } } @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(); if (dataFormat != DataFormat.XCONTENT) { builder.field(FORMAT_FIELD.getPreferredName(), dataFormat); } builder.field(TIME_FIELD_NAME_FIELD.getPreferredName(), timeFieldName); builder.field(TIME_FORMAT_FIELD.getPreferredName(), timeFormat); if (fieldDelimiter != null) { builder.field(FIELD_DELIMITER_FIELD.getPreferredName(), String.valueOf(fieldDelimiter)); } if (quoteCharacter != null) { builder.field(QUOTE_CHARACTER_FIELD.getPreferredName(), String.valueOf(quoteCharacter)); } builder.endObject(); return builder; } /** * The format of the data to be processed. * Defaults to {@link DataDescription.DataFormat#XCONTENT} * * @return The data format */ public DataFormat getFormat() { return dataFormat; } /** * The name of the field containing the timestamp * * @return A String if set or null */ public String getTimeField() { return timeFieldName; } /** * Either {@value #EPOCH}, {@value #EPOCH_MS} or a SimpleDateTime format string. * If not set (is null or an empty string) or set to * {@value #EPOCH_MS} (the default) then the date is assumed to be in * milliseconds from the epoch. * * @return A String if set or null */ public String getTimeFormat() { return timeFormat; } /** * If the data is in a delineated format with a header e.g. csv or tsv * this is the delimiter character used. This is only applicable if * {@linkplain #getFormat()} is {@link DataDescription.DataFormat#DELIMITED}. * The default value for delimited format is {@value #DEFAULT_DELIMITER}. * * @return A char */ public Character getFieldDelimiter() { return fieldDelimiter; } /** * The quote character used in delineated formats. * The default value for delimited format is {@value #DEFAULT_QUOTE_CHAR}. * * @return The delineated format quote character */ public Character getQuoteCharacter() { return quoteCharacter; } /** * Returns true if the data described by this object needs * transforming before processing by autodetect. * A transformation must be applied if either a timeformat is * not in seconds since the epoch or the data is in Json format. * * @return True if the data should be transformed. */ public boolean transform() { return dataFormat == DataFormat.XCONTENT || isTransformTime(); } /** * Return true if the time is in a format that needs transforming. * Anytime format this isn't {@value #EPOCH} or null * needs transforming. * * @return True if the time field needs to be transformed. */ public boolean isTransformTime() { return timeFormat != null && EPOCH.equals(timeFormat) == false; } /** * Return true if the time format is {@value #EPOCH_MS} * * @return True if the date is in milli-seconds since the epoch. */ public boolean isEpochMs() { return EPOCH_MS.equals(timeFormat); } private static Character extractChar(String charStr) { if (charStr.length() != 1) { throw new IllegalArgumentException("String must be a single character, found [" + charStr + "]"); } return charStr.charAt(0); } /** * Overridden equality test */ @Override public boolean equals(Object other) { if (this == other) { return true; } if (other instanceof DataDescription == false) { return false; } DataDescription that = (DataDescription) other; return this.dataFormat == that.dataFormat && Objects.equals(this.quoteCharacter, that.quoteCharacter) && Objects.equals(this.timeFieldName, that.timeFieldName) && Objects.equals(this.timeFormat, that.timeFormat) && Objects.equals(this.fieldDelimiter, that.fieldDelimiter); } @Override public int hashCode() { return Objects.hash(dataFormat, quoteCharacter, timeFieldName, timeFormat, fieldDelimiter); } public static class Builder { private DataFormat dataFormat = DataFormat.XCONTENT; private String timeFieldName = DEFAULT_TIME_FIELD; private String timeFormat = EPOCH_MS; private Character fieldDelimiter; private Character quoteCharacter; public Builder setFormat(DataFormat format) { dataFormat = ExceptionsHelper.requireNonNull(format, FORMAT_FIELD.getPreferredName() + " must not be null"); return this; } private Builder setFormat(String format) { setFormat(DataFormat.forString(format)); return this; } public Builder setTimeField(String fieldName) { timeFieldName = ExceptionsHelper.requireNonNull(fieldName, TIME_FIELD_NAME_FIELD.getPreferredName() + " must not be null"); return this; } public Builder setTimeFormat(String format) { ExceptionsHelper.requireNonNull(format, TIME_FORMAT_FIELD.getPreferredName() + " must not be null"); switch (format) { case EPOCH: case EPOCH_MS: break; default: try { DateTimeFormatterTimestampConverter.ofPattern(format, ZoneOffset.UTC); } catch (IllegalArgumentException e) { throw ExceptionsHelper.badRequestException( Messages.getMessage(Messages.JOB_CONFIG_INVALID_TIMEFORMAT, format), e.getCause()); } } timeFormat = format; return this; } public Builder setFieldDelimiter(Character delimiter) { fieldDelimiter = delimiter; return this; } public Builder setQuoteCharacter(Character value) { quoteCharacter = value; return this; } public DataDescription build() { if (dataFormat == DataFormat.DELIMITED) { if (fieldDelimiter == null) { fieldDelimiter = DEFAULT_DELIMITER; } if (quoteCharacter == null) { quoteCharacter = DEFAULT_QUOTE_CHAR; } } return new DataDescription(dataFormat, timeFieldName, timeFormat, fieldDelimiter, quoteCharacter); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy