All downloads are free. Search and download functionality uses the official Maven repository.

org.opensearch.grok.Grok Maven / Gradle / Ivy

/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */

package org.opensearch.grok;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.function.Consumer;

import org.jcodings.specific.UTF8Encoding;
import org.joni.Matcher;
import org.joni.NameEntry;
import org.joni.Option;
import org.joni.Regex;
import org.joni.Region;
import org.joni.Syntax;
import org.joni.exception.ValueException;

import static java.util.Collections.unmodifiableList;

public final class Grok {
    /**
     * Patterns built in to the grok library.
     */
    public static final Map BUILTIN_PATTERNS = loadBuiltinPatterns();

    private static final String NAME_GROUP = "name";
    private static final String SUBNAME_GROUP = "subname";
    private static final String PATTERN_GROUP = "pattern";
    private static final String DEFINITION_GROUP = "definition";
    private static final String GROK_PATTERN = "%\\{"
        + "(?"
        + "(?[A-z0-9]+)"
        + "(?::(?[[:alnum:]@\\[\\]_:.-]+))?"
        + ")"
        + "(?:=(?"
        + "(?:[^{}]+|\\.+)+"
        + ")"
        + ")?"
        + "\\}";
    private static final Regex GROK_PATTERN_REGEX = new Regex(
        GROK_PATTERN.getBytes(StandardCharsets.UTF_8),
        0,
        GROK_PATTERN.getBytes(StandardCharsets.UTF_8).length,
        Option.NONE,
        UTF8Encoding.INSTANCE,
        Syntax.DEFAULT
    );
    private static final int MAX_PATTERN_DEPTH_SIZE = 500;

    private static final int MAX_TO_REGEX_ITERATIONS = 100_000; // sanity limit

    private final Map patternBank;
    private final boolean namedCaptures;
    private final Regex compiledExpression;
    private final MatcherWatchdog matcherWatchdog;
    private final List captureConfig;

    public Grok(Map patternBank, String grokPattern, Consumer logCallBack) {
        this(patternBank, grokPattern, true, MatcherWatchdog.noop(), logCallBack);
    }

    public Grok(Map patternBank, String grokPattern, MatcherWatchdog matcherWatchdog, Consumer logCallBack) {
        this(patternBank, grokPattern, true, matcherWatchdog, logCallBack);
    }

    Grok(Map patternBank, String grokPattern, boolean namedCaptures, Consumer logCallBack) {
        this(patternBank, grokPattern, namedCaptures, MatcherWatchdog.noop(), logCallBack);
    }

    private Grok(
        Map patternBank,
        String grokPattern,
        boolean namedCaptures,
        MatcherWatchdog matcherWatchdog,
        Consumer logCallBack
    ) {
        this.patternBank = patternBank;
        this.namedCaptures = namedCaptures;
        this.matcherWatchdog = matcherWatchdog;

        validatePatternBank();

        String expression = toRegex(grokPattern);
        byte[] expressionBytes = expression.getBytes(StandardCharsets.UTF_8);
        this.compiledExpression = new Regex(
            expressionBytes,
            0,
            expressionBytes.length,
            Option.DEFAULT,
            UTF8Encoding.INSTANCE,
            logCallBack::accept
        );

        List captureConfig = new ArrayList<>();
        for (Iterator entry = compiledExpression.namedBackrefIterator(); entry.hasNext();) {
            captureConfig.add(new GrokCaptureConfig(entry.next()));
        }
        this.captureConfig = unmodifiableList(captureConfig);
    }

    /**
     * Entry point to recursively validate the pattern bank for circular dependencies and malformed URLs
     * via depth-first traversal. This implementation does not include memoization.
     */
    private void validatePatternBank() {
        for (String patternName : patternBank.keySet()) {
            validatePatternBank(patternName);
        }
    }

    /**
     * Checks whether patterns reference each other in a circular manner and, if so, fail with an exception.
     * Also checks for malformed pattern definitions and fails with an exception.
     * 

* In a pattern, anything between %{ and } or : is considered * a reference to another named pattern. This method will navigate to all these named patterns and * check for a circular reference. */ private void validatePatternBank(String initialPatternName) { Deque stack = new ArrayDeque<>(); Set visitedPatterns = new HashSet<>(); Map> pathMap = new HashMap<>(); List initialPath = new ArrayList<>(); initialPath.add(initialPatternName); pathMap.put(initialPatternName, initialPath); stack.push(new Frame(initialPatternName, initialPath, 0)); while (!stack.isEmpty()) { Frame frame = stack.peek(); String patternName = frame.patternName; List path = frame.path; int startIndex = frame.startIndex; String pattern = patternBank.get(patternName); if (visitedPatterns.contains(patternName)) { stack.pop(); continue; } visitedPatterns.add(patternName); boolean foundDependency = false; for (int i = startIndex; i < pattern.length(); i++) { if (pattern.startsWith("%{", i)) { int begin = i + 2; int syntaxEndIndex = pattern.indexOf('}', begin); if (syntaxEndIndex == -1) { throw new IllegalArgumentException("Malformed pattern [" + patternName + "][" + pattern + "]"); } int semanticNameIndex = pattern.indexOf(':', begin); int end = semanticNameIndex == -1 ? 
syntaxEndIndex : Math.min(syntaxEndIndex, semanticNameIndex); String dependsOnPattern = pattern.substring(begin, end); if (dependsOnPattern.equals(patternName)) { throwExceptionForCircularReference(patternName, pattern); } if (pathMap.containsKey(dependsOnPattern)) { throwExceptionForCircularReference(patternName, pattern, dependsOnPattern, path.subList(0, path.size() - 1)); } List newPath = new ArrayList<>(path); newPath.add(dependsOnPattern); pathMap.put(dependsOnPattern, newPath); stack.push(new Frame(dependsOnPattern, newPath, 0)); frame.startIndex = i + 1; foundDependency = true; break; } } if (!foundDependency) { pathMap.remove(patternName); stack.pop(); } if (stack.size() > MAX_PATTERN_DEPTH_SIZE) { throw new IllegalArgumentException("Pattern references exceeded maximum depth of " + MAX_PATTERN_DEPTH_SIZE); } } } private static class Frame { String patternName; List path; int startIndex; Frame(String patternName, List path, int startIndex) { this.patternName = patternName; this.path = path; this.startIndex = startIndex; } } private static void throwExceptionForCircularReference(String patternName, String pattern) { throwExceptionForCircularReference(patternName, pattern, null, null); } private static void throwExceptionForCircularReference( String patternName, String pattern, String originPatternName, List path ) { StringBuilder message = new StringBuilder("circular reference in pattern ["); message.append(patternName).append("][").append(pattern).append("]"); if (originPatternName != null) { message.append(" back to pattern [").append(originPatternName).append("]"); } if (path != null && path.size() > 1) { message.append(" via patterns [").append(String.join("=>", path)).append("]"); } throw new IllegalArgumentException(message.toString()); } private String groupMatch(String name, Region region, String pattern) { try { int number = GROK_PATTERN_REGEX.nameToBackrefNumber( name.getBytes(StandardCharsets.UTF_8), 0, name.getBytes(StandardCharsets.UTF_8).length, 
region ); int begin = region.getBeg(number); int end = region.getEnd(number); return new String(pattern.getBytes(StandardCharsets.UTF_8), begin, end - begin, StandardCharsets.UTF_8); } catch (StringIndexOutOfBoundsException | ValueException e) { return null; } } /** * converts a grok expression into a named regex expression * * @return named regex expression */ protected String toRegex(String grokPattern) { StringBuilder res = new StringBuilder(); for (int i = 0; i < MAX_TO_REGEX_ITERATIONS; i++) { byte[] grokPatternBytes = grokPattern.getBytes(StandardCharsets.UTF_8); Matcher matcher = GROK_PATTERN_REGEX.matcher(grokPatternBytes); int result; try { matcherWatchdog.register(matcher); result = matcher.search(0, grokPatternBytes.length, Option.NONE); } finally { matcherWatchdog.unregister(matcher); } if (result < 0) { return res.append(grokPattern).toString(); } Region region = matcher.getEagerRegion(); String namedPatternRef = groupMatch(NAME_GROUP, region, grokPattern); String subName = groupMatch(SUBNAME_GROUP, region, grokPattern); // TODO(tal): Support definitions @SuppressWarnings("unused") String definition = groupMatch(DEFINITION_GROUP, region, grokPattern); String patternName = groupMatch(PATTERN_GROUP, region, grokPattern); String pattern = patternBank.get(patternName); if (pattern == null) { throw new IllegalArgumentException("Unable to find pattern [" + patternName + "] in Grok's pattern dictionary"); } if (pattern.contains("%{" + patternName + "}") || pattern.contains("%{" + patternName + ":")) { throw new IllegalArgumentException("circular reference in pattern back [" + patternName + "]"); } String grokPart; if (namedCaptures && subName != null) { grokPart = String.format(Locale.US, "(?<%s>%s)", namedPatternRef, pattern); } else if (namedCaptures) { grokPart = String.format(Locale.US, "(?:%s)", pattern); } else { grokPart = String.format(Locale.US, "(?<%s>%s)", patternName + "_" + result, pattern); } String start = new String(grokPatternBytes, 0, 
result, StandardCharsets.UTF_8); String rest = new String( grokPatternBytes, region.getEnd(0), grokPatternBytes.length - region.getEnd(0), StandardCharsets.UTF_8 ); grokPattern = grokPart + rest; res.append(start); } throw new IllegalArgumentException("Can not convert grok patterns to regular expression"); } /** * Checks whether a specific text matches the defined grok expression. * * @param text the string to match * @return true if grok expression matches text or there is a timeout, false otherwise. */ public boolean match(String text) { Matcher matcher = compiledExpression.matcher(text.getBytes(StandardCharsets.UTF_8)); int result; try { matcherWatchdog.register(matcher); result = matcher.search(0, text.length(), Option.DEFAULT); } finally { matcherWatchdog.unregister(matcher); } return (result != -1); } /** * Matches and returns any named captures. * * @param text the text to match and extract values from. * @return a map containing field names and their respective coerced values that matched or null if the pattern didn't match */ public Map captures(String text) { byte[] utf8Bytes = text.getBytes(StandardCharsets.UTF_8); GrokCaptureExtracter.MapExtracter extracter = new GrokCaptureExtracter.MapExtracter(captureConfig); if (match(utf8Bytes, 0, utf8Bytes.length, extracter)) { return extracter.result(); } return null; } /** * Matches and collects any named captures. * @param utf8Bytes array containing the text to match against encoded in utf-8 * @param offset offset {@code utf8Bytes} of the start of the text * @param length length of the text to match * @param extracter collector for captures. {@link GrokCaptureConfig#nativeExtracter} can build these. 
* @return true if there was a match, false otherwise * @throws RuntimeException if there was a timeout */ public boolean match(byte[] utf8Bytes, int offset, int length, GrokCaptureExtracter extracter) { Matcher matcher = compiledExpression.matcher(utf8Bytes, offset, offset + length); int result; try { matcherWatchdog.register(matcher); result = matcher.search(offset, length, Option.DEFAULT); } finally { matcherWatchdog.unregister(matcher); } if (result == Matcher.INTERRUPTED) { throw new RuntimeException( "grok pattern matching was interrupted after [" + matcherWatchdog.maxExecutionTimeInMillis() + "] ms" ); } if (result == Matcher.FAILED) { return false; } extracter.extract(utf8Bytes, offset, matcher.getEagerRegion()); return true; } /** * The list of values that this {@linkplain Grok} can capture. */ public List captureConfig() { return captureConfig; } /** * Load built-in patterns. */ private static Map loadBuiltinPatterns() { String[] patternNames = new String[] { "aws", "bacula", "bind", "bro", "exim", "firewalls", "grok-patterns", "haproxy", "java", "junos", "linux-syslog", "maven", "mcollective-patterns", "mongodb", "nagios", "postgresql", "rails", "redis", "ruby", "squid" }; Map builtinPatterns = new LinkedHashMap<>(); for (String pattern : patternNames) { try { try (InputStream is = Grok.class.getResourceAsStream("/patterns/" + pattern)) { loadPatterns(builtinPatterns, is); } } catch (IOException e) { throw new RuntimeException("failed to load built-in patterns", e); } } return Collections.unmodifiableMap(builtinPatterns); } private static void loadPatterns(Map patternBank, InputStream inputStream) throws IOException { String line; BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8)); while ((line = br.readLine()) != null) { String trimmedLine = line.replaceAll("^\\s+", ""); if (trimmedLine.startsWith("#") || trimmedLine.length() == 0) { continue; } String[] parts = trimmedLine.split("\\s+", 2); if 
(parts.length == 2) { patternBank.put(parts[0], parts[1]); } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy