// org.opensearch.dissect.DissectParser Maven / Gradle / Ivy
//
// Go to download
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */

package org.opensearch.dissect;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Splits (dissects) a string into its parts based on a pattern.
 * <p>A dissect pattern is composed of a set of keys and delimiters.
 * For example the dissect pattern {@code %{a} %{b},%{c}} has 3 keys (a,b,c) and two delimiters (space and comma). This pattern will
 * match a string of the form {@code foo bar,baz} and will result in a key/value pairing of {@code a=foo, b=bar, c=baz}.
 * <p>Matches are all or nothing. For example, the same pattern will NOT match {@code foo bar baz} since not all of the delimiters
 * matched (the comma did not match).
 * <p>Dissect patterns can optionally have modifiers. These modifiers instruct the parser to change its behavior. For example the
 * dissect pattern of {@code %{a},%{b}:%{c}} would not match {@code foo,bar,baz} since the colon never matches.
 * <p>Modifiers appear to the left or the right of the key name. The supported modifiers are:
 * <ul>
 * <li>{@code ->} Instructs the parser to ignore repeating delimiters to the right of the key. Example: <pre>
 * pattern: {@code %{a->} %{b} %{c}}
 * string: {@code foo         bar baz}
 * result: {@code a=foo, b=bar, c=baz}
 * </pre></li>
 * <li>{@code +} Instructs the parser to append this key's value to the value of the prior key with the same name.
 * Example: <pre>
 * pattern: {@code %{a} %{+a} %{+a}}
 * string: {@code foo bar baz}
 * result: {@code a=foobarbaz}
 * </pre></li>
 * <li>{@code /} Instructs the parser to append this key's value to the value of a key based on the order specified after the
 * {@code /}. Requires the {@code +} modifier to also be present in the key. Example: <pre>
 * pattern: {@code %{a} %{+a/2} %{+a/1}}
 * string: {@code foo bar baz}
 * result: {@code a=foobazbar}
 * </pre>
 * </li>
 * <li>{@code *} Instructs the parser to ignore the name of this key, and instead use the value of the key as the key name.
 * Requires another key with the same name and the {@code &} modifier to be the value. Example: <pre>
 * pattern: {@code %{*a} %{b} %{&a}}
 * string: {@code foo bar baz}
 * result: {@code foo=baz, b=bar}
 * </pre></li>
 * <li>{@code &} Instructs the parser to ignore this key and place the matched value to a key of the same name with the {@code *} modifier.
 * Requires another key with the same name and the {@code *} modifier.
 * Example: <pre>
 * pattern: {@code %{*a} %{b} %{&a}}
 * string: {@code foo bar baz}
 * result: {@code foo=baz, b=bar}
 * </pre></li>
 * <li>{@code ?} Instructs the parser to ignore this key. The key name exists only for the purpose of human readability. Example:
 * <pre>
 *  pattern: {@code %{a} %{?skipme} %{c}}
 *  string: {@code foo bar baz}
 *  result: {@code a=foo, c=baz}
 * </pre>
 * </li>
 * </ul>
 * <p>Empty key name patterns are also supported. They behave just like the {@code ?} modifier, except the name is not required.
 * The result will simply be ignored. Example:
 * <pre>
 * pattern: {@code %{a} %{} %{c}}
 * string: {@code foo bar baz}
 * result: {@code a=foo, c=baz}
 * </pre>
 *
 * <p>
 * Inspired by the Logstash Dissect Filter by Guy Boertje
 */
public final class DissectParser {
    private static final Pattern LEADING_DELIMITER_PATTERN = Pattern.compile("^(.*?)%");
    private static final Pattern KEY_DELIMITER_FIELD_PATTERN = Pattern.compile("%\\{([^}]*?)}([^%]*)", Pattern.DOTALL);
    private static final EnumSet ASSOCIATE_MODIFIERS = EnumSet.of(
        DissectKey.Modifier.FIELD_NAME,
        DissectKey.Modifier.FIELD_VALUE
    );
    private static final EnumSet APPEND_MODIFIERS = EnumSet.of(
        DissectKey.Modifier.APPEND,
        DissectKey.Modifier.APPEND_WITH_ORDER
    );
    private static final Function KEY_NAME = val -> val.getKey().getName();
    private final List matchPairs;
    private final String pattern;
    private String leadingDelimiter = "";
    private final int maxMatches;
    private final int maxResults;
    private final int appendCount;
    private final int referenceCount;
    private final String appendSeparator;

    public DissectParser(String pattern, String appendSeparator) {
        this.pattern = pattern;
        this.appendSeparator = appendSeparator == null ? "" : appendSeparator;
        Matcher matcher = LEADING_DELIMITER_PATTERN.matcher(pattern);
        while (matcher.find()) {
            leadingDelimiter = matcher.group(1);
        }
        List matchPairs = new ArrayList<>();
        matcher = KEY_DELIMITER_FIELD_PATTERN.matcher(pattern.substring(leadingDelimiter.length()));
        while (matcher.find()) {
            DissectKey key = new DissectKey(matcher.group(1));
            String delimiter = matcher.group(2);
            matchPairs.add(new DissectPair(key, delimiter));
        }
        this.maxMatches = matchPairs.size();
        this.maxResults = Long.valueOf(
            matchPairs.stream().filter(dissectPair -> !dissectPair.getKey().skip()).map(KEY_NAME).distinct().count()
        ).intValue();
        if (this.maxMatches == 0 || maxResults == 0) {
            throw new DissectException.PatternParse(pattern, "Unable to find any keys or delimiters.");
        }
        // append validation - look through all of the keys to see if there are any keys that need to participate in an append operation
        // but don't have the '+' defined
        Set appendKeyNames = matchPairs.stream()
            .filter(dissectPair -> APPEND_MODIFIERS.contains(dissectPair.getKey().getModifier()))
            .map(KEY_NAME)
            .distinct()
            .collect(Collectors.toSet());
        if (appendKeyNames.size() > 0) {
            List modifiedMatchPairs = new ArrayList<>(matchPairs.size());
            for (DissectPair p : matchPairs) {
                if (p.getKey().getModifier().equals(DissectKey.Modifier.NONE) && appendKeyNames.contains(p.getKey().getName())) {
                    modifiedMatchPairs.add(new DissectPair(new DissectKey(p.getKey(), DissectKey.Modifier.APPEND), p.getDelimiter()));
                } else {
                    modifiedMatchPairs.add(p);
                }
            }
            matchPairs = modifiedMatchPairs;
        }
        appendCount = appendKeyNames.size();

        // reference validation - ensure that '*' and '&' come in pairs
        Map> referenceGroupings = matchPairs.stream()
            .filter(dissectPair -> ASSOCIATE_MODIFIERS.contains(dissectPair.getKey().getModifier()))
            .collect(Collectors.groupingBy(KEY_NAME));
        for (Map.Entry> entry : referenceGroupings.entrySet()) {
            if (entry.getValue().size() != 2) {
                throw new DissectException.PatternParse(
                    pattern,
                    "Found invalid key/reference associations: '"
                        + entry.getValue().stream().map(KEY_NAME).collect(Collectors.joining(","))
                        + "' Please ensure each '*' is matched with a matching '&"
                );
            }
        }

        referenceCount = referenceGroupings.size() * 2;
        this.matchPairs = Collections.unmodifiableList(matchPairs);
    }

    /**
     * 
Entry point to dissect a string into it's parts.
     *
     * @param inputString The string to dissect
     * @return the key/value Map of the results
     * @throws DissectException if unable to dissect a pair into it's parts.
     */
    public Map parse(String inputString) {
        /*

          This implements a naive string matching algorithm. The string is walked left to right, comparing each byte against
          another string's bytes looking for matches. If the bytes match, then a second cursor looks ahead to see if all the bytes
          of the other string matches. If they all match, record it and advances the primary cursor to the match point. If it can not match
          all of the bytes then progress the main cursor. Repeat till the end of the input string. Since the string being searching for
          (the delimiter) is generally small and rare the naive approach is efficient.

          In this case the string that is walked is the input string, and the string being searched for is the current delimiter.
          For example for a dissect pattern of {@code %{a},%{b}:%{c}} the delimiters (comma then colon) are searched for in the
          input string. At class construction the list of keys+delimiters are found (dissectPairs), which allows the use of that ordered
          list to know which delimiter to use for the search. The delimiters is progressed once the current delimiter is matched.

          There are two special cases that requires additional parsing beyond the standard naive algorithm. Consecutive delimiters should
          results in a empty matches unless the {@code ->} is provided. For example given the dissect pattern of
          {@code %{a},%{b},%{c},%{d}} and input string of {@code foo,,,} the match should be successful with empty values for b,c and d.
          However, if the key modifier {@code ->}, is present it will simply skip over any delimiters just to the right of the key
          without assigning any values. For example {@code %{a->},{%b}} will match the input string of {@code foo,,,,,,bar} with a=foo and
          b=bar.
         */
        DissectMatch dissectMatch = new DissectMatch(appendSeparator, maxMatches, maxResults, appendCount, referenceCount);
        Iterator it = matchPairs.iterator();
        // ensure leading delimiter matches
        if (inputString != null
            && inputString.length() > leadingDelimiter.length()
            && leadingDelimiter.equals(inputString.substring(0, leadingDelimiter.length()))) {
            byte[] input = inputString.getBytes(StandardCharsets.UTF_8);
            // grab the first key/delimiter pair
            DissectPair dissectPair = it.next();
            DissectKey key = dissectPair.getKey();
            byte[] delimiter = dissectPair.getDelimiter().getBytes(StandardCharsets.UTF_8);
            // start dissection after the first delimiter
            int i = leadingDelimiter.length();
            int valueStart = i;
            int lookAheadMatches;
            // start walking the input string byte by byte, look ahead for matches where needed
            // if a match is found jump forward to the end of the match
            while (i < input.length) {
                // start is only used to record the value of i
                int start = i;

                lookAheadMatches = 0;
                // potential match between delimiter and input string
                if (delimiter.length > 0 && input[i] == delimiter[0]) {
                    // look ahead to see if the entire delimiter matches the input string
                    for (int j = 0; j < delimiter.length; j++) {
                        if (i + j < input.length && input[i + j] == delimiter[j]) {
                            lookAheadMatches++;
                        }
                    }
                    // found a full delimiter match
                    if (lookAheadMatches == delimiter.length) {
                        // record the key/value tuple
                        byte[] value = Arrays.copyOfRange(input, valueStart, i);
                        dissectMatch.add(key, new String(value, StandardCharsets.UTF_8));
                        // jump to the end of the match
                        i += lookAheadMatches;
                        // look for consecutive delimiters (e.g. a,,,,d,e)
                        while (i < input.length) {
                            lookAheadMatches = 0;
                            for (int j = 0; j < delimiter.length; j++) {
                                if (i + j < input.length && input[i + j] == delimiter[j]) {
                                    lookAheadMatches++;
                                }
                            }
                            // found consecutive delimiters
                            if (lookAheadMatches == delimiter.length) {
                                // jump to the end of the match
                                i += lookAheadMatches;
                                if (!key.skipRightPadding()) {
                                    // progress the keys/delimiter if possible
                                    if (!it.hasNext()) {
                                        break; // the while loop
                                    }
                                    dissectPair = it.next();
                                    key = dissectPair.getKey();
                                    // add the key with an empty value for the empty delimiter
                                    dissectMatch.add(key, "");
                                }
                            } else {
                                break; // the while loop
                            }
                        }
                        // progress the keys/delimiter if possible
                        if (!it.hasNext()) {
                            break; // the for loop
                        }
                        dissectPair = it.next();
                        key = dissectPair.getKey();
                        delimiter = dissectPair.getDelimiter().getBytes(StandardCharsets.UTF_8);
                        // i is always one byte after the last found delimiter, aka the start of the next value
                        valueStart = i;
                    } else {
                        i++;
                    }
                } else {
                    i++;
                }
                // i should change anyway
                assert (i != start);
            }
            // the last key, grab the rest of the input (unless consecutive delimiters already grabbed the last key)
            // and there is no trailing delimiter
            if (!dissectMatch.fullyMatched() && delimiter.length == 0) {
                byte[] value = Arrays.copyOfRange(input, valueStart, input.length);
                String valueString = new String(value, StandardCharsets.UTF_8);
                dissectMatch.add(key, valueString);
            }
        }
        Map results = dissectMatch.getResults();

        if (!dissectMatch.isValid(results)) {
            throw new DissectException.FindMatch(pattern, inputString);
        }
        return results;
    }

    /**
     * A tuple class to hold the dissect key and delimiter
     */
    private class DissectPair {

        private final DissectKey key;
        private final String delimiter;

        private DissectPair(DissectKey key, String delimiter) {
            this.key = key;
            this.delimiter = delimiter;
        }

        private DissectKey getKey() {
            return key;
        }

        private String getDelimiter() {
            return delimiter;
        }
    }

}