
org.opensearch.dissect.DissectParser Maven / Gradle / Ivy
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.dissect;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* Splits (dissects) a string into its parts based on a pattern.
A dissect pattern is composed of a set of keys and delimiters.
* For example the dissect pattern:
%{a} %{b},%{c}
has 3 keys (a,b,c) and two delimiters (space and comma). This pattern will
* match a string of the form: foo bar,baz
and will result a key/value pairing of a=foo, b=bar, and c=baz.
* Matches are all or nothing. For example, the same pattern will NOT match
foo bar baz
since all of the delimiters did not
* match. (the comma did not match)
* Dissect patterns can optionally have modifiers. These modifiers instruct the parser to change it's behavior. For example the
* dissect pattern of
%{a},%{b}:%{c}
would not match foo,bar,baz
since there the colon never matches.
* Modifiers appear to the left or the right of the key name. The supported modifiers are:
*
* - {@code ->} Instructs the parser to ignore repeating delimiters to the right of the key. Example:
* pattern: {@code %{a->} %{b} %{c}}
* string: {@code foo bar baz}
* result: {@code a=foo, b=bar, c=baz}
*
* - {@code +} Instructs the parser to appends this key's value to value of prior key with the same name.
* Example:
* pattern: {@code %{a} %{+a} %{+a}}
* string: {@code foo bar baz}
* result: {@code a=foobarbaz}
*
* - {@code /} Instructs the parser to appends this key's value to value of a key based based on the order specified after the
* {@code /}. Requires the {@code +} modifier to also be present in the key. Example:
* pattern: {@code %{a} %{+a/2} %{+a/1}}
* string: {@code foo bar baz}
* result: {@code a=foobazbar}
*
*
* - {@code *} Instructs the parser to ignore the name of this key, instead use the value of key as the key name.
* Requires another key with the same name and the {@code &} modifier to be the value. Example:
* pattern: {@code %{*a} %{b} %{&a}}
* string: {@code foo bar baz}
* result: {@code foo=baz, b=bar}
*
* - {@code &} Instructs the parser to ignore this key and place the matched value to a key of the same name with the {@code *} modifier.
* Requires another key with the same name and the {@code *} modifier.
* Example:
* pattern: {@code %{*a} %{b} %{&a}}
* string: {@code foo bar baz}
* result: {@code foo=baz, b=bar}
*
* - {@code ?} Instructs the parser to ignore this key. The key name exists only for the purpose of human readability. Example
*
* pattern: {@code %{a} %{?skipme} %{c}}
* string: {@code foo bar baz}
* result: {@code a=foo, c=baz}
*
*
* Empty key names patterns are also supported. They behave just like the {@code ?} modifier, except the name is not required.
* The result will simply be ignored. Example
*
* pattern: {@code %{a} %{} %{c}}
* string: {@code foo bar baz}
* result: {@code a=foo, c=baz}
*
*
* Inspired by the Logstash Dissect Filter by Guy Boertje
*/
public final class DissectParser {
private static final Pattern LEADING_DELIMITER_PATTERN = Pattern.compile("^(.*?)%");
private static final Pattern KEY_DELIMITER_FIELD_PATTERN = Pattern.compile("%\\{([^}]*?)}([^%]*)", Pattern.DOTALL);
private static final EnumSet ASSOCIATE_MODIFIERS = EnumSet.of(
DissectKey.Modifier.FIELD_NAME,
DissectKey.Modifier.FIELD_VALUE
);
private static final EnumSet APPEND_MODIFIERS = EnumSet.of(
DissectKey.Modifier.APPEND,
DissectKey.Modifier.APPEND_WITH_ORDER
);
private static final Function KEY_NAME = val -> val.getKey().getName();
private final List matchPairs;
private final String pattern;
private String leadingDelimiter = "";
private final int maxMatches;
private final int maxResults;
private final int appendCount;
private final int referenceCount;
private final String appendSeparator;
public DissectParser(String pattern, String appendSeparator) {
this.pattern = pattern;
this.appendSeparator = appendSeparator == null ? "" : appendSeparator;
Matcher matcher = LEADING_DELIMITER_PATTERN.matcher(pattern);
while (matcher.find()) {
leadingDelimiter = matcher.group(1);
}
List matchPairs = new ArrayList<>();
matcher = KEY_DELIMITER_FIELD_PATTERN.matcher(pattern.substring(leadingDelimiter.length()));
while (matcher.find()) {
DissectKey key = new DissectKey(matcher.group(1));
String delimiter = matcher.group(2);
matchPairs.add(new DissectPair(key, delimiter));
}
this.maxMatches = matchPairs.size();
this.maxResults = Long.valueOf(
matchPairs.stream().filter(dissectPair -> !dissectPair.getKey().skip()).map(KEY_NAME).distinct().count()
).intValue();
if (this.maxMatches == 0 || maxResults == 0) {
throw new DissectException.PatternParse(pattern, "Unable to find any keys or delimiters.");
}
// append validation - look through all of the keys to see if there are any keys that need to participate in an append operation
// but don't have the '+' defined
Set appendKeyNames = matchPairs.stream()
.filter(dissectPair -> APPEND_MODIFIERS.contains(dissectPair.getKey().getModifier()))
.map(KEY_NAME)
.distinct()
.collect(Collectors.toSet());
if (appendKeyNames.size() > 0) {
List modifiedMatchPairs = new ArrayList<>(matchPairs.size());
for (DissectPair p : matchPairs) {
if (p.getKey().getModifier().equals(DissectKey.Modifier.NONE) && appendKeyNames.contains(p.getKey().getName())) {
modifiedMatchPairs.add(new DissectPair(new DissectKey(p.getKey(), DissectKey.Modifier.APPEND), p.getDelimiter()));
} else {
modifiedMatchPairs.add(p);
}
}
matchPairs = modifiedMatchPairs;
}
appendCount = appendKeyNames.size();
// reference validation - ensure that '*' and '&' come in pairs
Map> referenceGroupings = matchPairs.stream()
.filter(dissectPair -> ASSOCIATE_MODIFIERS.contains(dissectPair.getKey().getModifier()))
.collect(Collectors.groupingBy(KEY_NAME));
for (Map.Entry> entry : referenceGroupings.entrySet()) {
if (entry.getValue().size() != 2) {
throw new DissectException.PatternParse(
pattern,
"Found invalid key/reference associations: '"
+ entry.getValue().stream().map(KEY_NAME).collect(Collectors.joining(","))
+ "' Please ensure each '*' is matched with a matching '&"
);
}
}
referenceCount = referenceGroupings.size() * 2;
this.matchPairs = Collections.unmodifiableList(matchPairs);
}
/**
* Entry point to dissect a string into it's parts.
*
* @param inputString The string to dissect
* @return the key/value Map of the results
* @throws DissectException if unable to dissect a pair into it's parts.
*/
public Map parse(String inputString) {
/**
*
* This implements a naive string matching algorithm. The string is walked left to right, comparing each byte against
* another string's bytes looking for matches. If the bytes match, then a second cursor looks ahead to see if all the bytes
* of the other string matches. If they all match, record it and advances the primary cursor to the match point. If it can not match
* all of the bytes then progress the main cursor. Repeat till the end of the input string. Since the string being searching for
* (the delimiter) is generally small and rare the naive approach is efficient.
*
* In this case the string that is walked is the input string, and the string being searched for is the current delimiter.
* For example for a dissect pattern of {@code %{a},%{b}:%{c}} the delimiters (comma then colon) are searched for in the
* input string. At class construction the list of keys+delimiters are found (dissectPairs), which allows the use of that ordered
* list to know which delimiter to use for the search. The delimiters is progressed once the current delimiter is matched.
*
* There are two special cases that requires additional parsing beyond the standard naive algorithm. Consecutive delimiters should
* results in a empty matches unless the {@code ->} is provided. For example given the dissect pattern of
* {@code %{a},%{b},%{c},%{d}} and input string of {@code foo,,,} the match should be successful with empty values for b,c and d.
* However, if the key modifier {@code ->}, is present it will simply skip over any delimiters just to the right of the key
* without assigning any values. For example {@code %{a->},{%b}} will match the input string of {@code foo,,,,,,bar} with a=foo and
* b=bar.
*
*/
DissectMatch dissectMatch = new DissectMatch(appendSeparator, maxMatches, maxResults, appendCount, referenceCount);
Iterator it = matchPairs.iterator();
// ensure leading delimiter matches
if (inputString != null
&& inputString.length() > leadingDelimiter.length()
&& leadingDelimiter.equals(inputString.substring(0, leadingDelimiter.length()))) {
byte[] input = inputString.getBytes(StandardCharsets.UTF_8);
// grab the first key/delimiter pair
DissectPair dissectPair = it.next();
DissectKey key = dissectPair.getKey();
byte[] delimiter = dissectPair.getDelimiter().getBytes(StandardCharsets.UTF_8);
// start dissection after the first delimiter
int i = leadingDelimiter.length();
int valueStart = i;
int lookAheadMatches;
// start walking the input string byte by byte, look ahead for matches where needed
// if a match is found jump forward to the end of the match
for (; i < input.length; i++) {
lookAheadMatches = 0;
// potential match between delimiter and input string
if (delimiter.length > 0 && input[i] == delimiter[0]) {
// look ahead to see if the entire delimiter matches the input string
for (int j = 0; j < delimiter.length; j++) {
if (i + j < input.length && input[i + j] == delimiter[j]) {
lookAheadMatches++;
}
}
// found a full delimiter match
if (lookAheadMatches == delimiter.length) {
// record the key/value tuple
byte[] value = Arrays.copyOfRange(input, valueStart, i);
dissectMatch.add(key, new String(value, StandardCharsets.UTF_8));
// jump to the end of the match
i += lookAheadMatches;
// look for consecutive delimiters (e.g. a,,,,d,e)
while (i < input.length) {
lookAheadMatches = 0;
for (int j = 0; j < delimiter.length; j++) {
if (i + j < input.length && input[i + j] == delimiter[j]) {
lookAheadMatches++;
}
}
// found consecutive delimiters
if (lookAheadMatches == delimiter.length) {
// jump to the end of the match
i += lookAheadMatches;
if (!key.skipRightPadding()) {
// progress the keys/delimiter if possible
if (!it.hasNext()) {
break; // the while loop
}
dissectPair = it.next();
key = dissectPair.getKey();
// add the key with an empty value for the empty delimiter
dissectMatch.add(key, "");
}
} else {
break; // the while loop
}
}
// progress the keys/delimiter if possible
if (!it.hasNext()) {
break; // the for loop
}
dissectPair = it.next();
key = dissectPair.getKey();
delimiter = dissectPair.getDelimiter().getBytes(StandardCharsets.UTF_8);
// i is always one byte after the last found delimiter, aka the start of the next value
valueStart = i;
}
}
}
// the last key, grab the rest of the input (unless consecutive delimiters already grabbed the last key)
// and there is no trailing delimiter
if (!dissectMatch.fullyMatched() && delimiter.length == 0) {
byte[] value = Arrays.copyOfRange(input, valueStart, input.length);
String valueString = new String(value, StandardCharsets.UTF_8);
dissectMatch.add(key, valueString);
}
}
Map results = dissectMatch.getResults();
if (!dissectMatch.isValid(results)) {
throw new DissectException.FindMatch(pattern, inputString);
}
return results;
}
/**
* A tuple class to hold the dissect key and delimiter
*/
private class DissectPair {
private final DissectKey key;
private final String delimiter;
private DissectPair(DissectKey key, String delimiter) {
this.key = key;
this.delimiter = delimiter;
}
private DissectKey getKey() {
return key;
}
private String getDelimiter() {
return delimiter;
}
}
}