// com.couchbase.connect.kafka.util.DocumentPathExtractor Maven / Gradle / Ivy
//
// Go to download
/*
 * Copyright 2017 Couchbase, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.couchbase.connect.kafka.util;

import com.couchbase.client.core.deps.com.fasterxml.jackson.core.JsonFactory;
import com.couchbase.client.core.deps.com.fasterxml.jackson.core.JsonParser;
import com.couchbase.client.core.deps.com.fasterxml.jackson.core.JsonPointer;
import com.couchbase.client.core.deps.com.fasterxml.jackson.core.filter.FilteringParserDelegate;
import com.couchbase.client.core.deps.com.fasterxml.jackson.core.filter.JsonPointerBasedFilter;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static com.couchbase.client.core.deps.com.fasterxml.jackson.core.filter.TokenFilter.Inclusion.ONLY_INCLUDE_ALL;
import static java.util.Collections.unmodifiableMap;
import static java.util.Objects.requireNonNull;

/**
 * Locates a document ID using a JSON pointer, optionally removing the ID from the document.
 * <p>
 * The ID format is a template containing one or more <code>${pointer}</code> placeholders,
 * where each {@code pointer} is a JSON pointer such as {@code /id}. For backwards
 * compatibility, a format containing no placeholder is treated as a single JSON pointer.
 * <p>
 * Immutable.
 */
public class DocumentPathExtractor {

  /**
   * Thrown when a JSON pointer used by the ID format matches nothing in the document,
   * or matches a value that is null or non-scalar.
   */
  public static class DocumentPathNotFoundException extends Exception {
    DocumentPathNotFoundException(String message) {
      super(message);
    }
  }

  /**
   * Result of an extraction: the rendered path value and the (possibly modified) document bytes.
   */
  public static class DocumentExtraction {
    private final String pathValue;
    private final byte[] data;

    DocumentExtraction(String pathValue, byte[] data) {
      this.pathValue = pathValue;
      this.data = requireNonNull(data);
    }

    /**
     * @return the ID format with every placeholder replaced by the matching document value.
     */
    public String getPathValue() {
      return pathValue;
    }

    /**
     * @return the document content. This is the array that was given to the constructor,
     * not a copy.
     */
    public byte[] getData() {
      return data;
    }
  }

  /**
   * A mutable half-open range {@code [startOffset, pastEndOffset)} over a byte array.
   */
  private static class ByteRange {
    private final byte[] bytes;
    private int startOffset;
    private int pastEndOffset;

    private ByteRange(byte[] bytes, int startOffset, int pastEndOffset) {
      this.bytes = bytes;
      this.startOffset = startOffset;
      this.pastEndOffset = pastEndOffset;
    }

    /**
     * Returns the range spanning from the parser's token location to its current location.
     */
    private static ByteRange forCurrentToken(byte[] bytes, JsonParser parser) {
      return new ByteRange(bytes,
          (int) parser.getTokenLocation().getByteOffset(),
          (int) parser.getCurrentLocation().getByteOffset());
    }

    @Override
    public String toString() {
      // Decode explicitly as UTF-8 (the documented encoding of the JSON input) instead of
      // relying on the platform default charset.
      return "[" + startOffset + "," + pastEndOffset + ") = |"
          + new String(bytes, startOffset, pastEndOffset - startOffset, java.nio.charset.StandardCharsets.UTF_8)
          + "|";
    }

    /**
     * Overwrites this range's offsets in the given array with {@code fillByte}.
     */
    void fill(byte[] bytes, byte fillByte) {
      Arrays.fill(bytes, startOffset, pastEndOffset, fillByte);
    }
  }

  private static final JsonFactory factory = new JsonFactory();

  // Matches "${...}" and captures the text between the delimiters (reluctantly, so that
  // adjacent placeholders are matched separately).
  private static final Pattern PLACEHOLDER_PATTERN = Pattern.compile("\\$\\{(.+?)}");

  private final String documentPathFormat;

  // Key: the full placeholder text including delimiters. Value: the compiled pointer inside it.
  private final Map<String, JsonPointer> placeholderToJsonPointer;

  private final boolean removeDocumentPath;

  /**
   * @param documentPathFormat template with <code>${pointer}</code> placeholders, or a bare
   * JSON pointer. Must not be null or empty.
   * @param removeDocumentPath if true, {@link #extractDocumentPath} blanks out the matched
   * fields in the document it is given.
   * @throws IllegalArgumentException if the format is empty
   */
  public DocumentPathExtractor(String documentPathFormat, boolean removeDocumentPath) {
    requireNonNull(documentPathFormat, "documentPathFormat");
    if (documentPathFormat.isEmpty()) {
      throw new IllegalArgumentException("Document ID format must not be empty");
    }

    Matcher m = PLACEHOLDER_PATTERN.matcher(documentPathFormat);
    if (!m.find()) {
      // For backwards compatibility, treat the whole thing as a single JSON pointer
      documentPathFormat = "${" + documentPathFormat + "}";
      m = PLACEHOLDER_PATTERN.matcher(documentPathFormat);
      if (!m.find()) {
        // Shouldn't happen, since we just added the placeholder delimiters
        throw new AssertionError("invalid document ID format string");
      }
    }

    final Map<String, JsonPointer> pointers = new HashMap<>();
    do {
      final String placeholder = m.group();
      final String jsonPointer = m.group(1);
      pointers.put(placeholder, JsonPointer.compile(jsonPointer));
    } while (m.find());

    this.placeholderToJsonPointer = unmodifiableMap(pointers);
    this.documentPathFormat = documentPathFormat;
    this.removeDocumentPath = removeDocumentPath;
  }

  /**
   * Resolves every placeholder of the ID format against the given document.
   *
   * @param json The document content encoded as UTF-8. If this method returns normally,
   * it may modify the contents of the array to remove the fields used by the document ID.
   * The removed bytes are overwritten with spaces; the array length does not change.
   * @return the rendered ID together with the same {@code json} array
   * @throws IOException if the JSON parser fails
   * @throws DocumentPathNotFoundException if a pointer matches nothing, or matches a value
   * that is null or non-scalar. The array is not modified in this case.
   */
  public DocumentExtraction extractDocumentPath(final byte[] json) throws IOException, DocumentPathNotFoundException {
    final List<ByteRange> rangesToRemove = new ArrayList<>(placeholderToJsonPointer.size());
    final Map<String, String> placeholderToValue = new HashMap<>();

    for (Map.Entry<String, JsonPointer> idComponent : placeholderToJsonPointer.entrySet()) {
      final JsonPointer documentIdPointer = idComponent.getValue();

      // try-with-resources so the parser is closed once the value has been read.
      try (JsonParser parser = new FilteringParserDelegate(
          factory.createParser(json), new JsonPointerBasedFilter(documentIdPointer), ONLY_INCLUDE_ALL, false)) {

        if (parser.nextToken() == null) {
          throw new DocumentPathNotFoundException("Document has no value matching JSON pointer '" + documentIdPointer + "'");
        }

        final String component = parser.getValueAsString();
        if (component == null) {
          throw new DocumentPathNotFoundException("The value matching JSON pointer '" + documentIdPointer + "' is null or non-scalar.");
        }

        placeholderToValue.put(idComponent.getKey(), component);

        if (removeDocumentPath) {
          rangesToRemove.add(ByteRange.forCurrentToken(json, parser));
        }
      }
    }

    final String documentId = renderDocumentPath(placeholderToValue);

    // At this point we're sure DocumentPathNotFoundException wasn't thrown, and we can expect
    // this method to return normally. It is finally safe to modify the document content.
    for (ByteRange range : rangesToRemove) {
      swallowFieldName(range);
      swallowOneComma(range);
      range.fill(json, (byte) ' ');
    }

    return new DocumentExtraction(documentId, json);
  }

  /**
   * Substitutes every placeholder of the format in a single left-to-right pass.
   * <p>
   * A single pass (rather than repeated {@code String.replace} calls) guarantees that a
   * substituted value which happens to look like another placeholder is never re-expanded,
   * so the result does not depend on map iteration order.
   */
  private String renderDocumentPath(Map<String, String> placeholderToValue) {
    final Matcher m = PLACEHOLDER_PATTERN.matcher(documentPathFormat);
    final StringBuffer result = new StringBuffer();
    while (m.find()) {
      // quoteReplacement: the value is literal text; '$' and '\' must not be interpreted.
      m.appendReplacement(result, Matcher.quoteReplacement(placeholderToValue.get(m.group())));
    }
    m.appendTail(result);
    return result.toString();
  }

  /**
   * Widens the range over surrounding whitespace and then over one adjacent comma,
   * preferring the comma on the right.
   */
  private static void swallowOneComma(ByteRange range) {
    swallowWhitespace(range);

    if (range.pastEndOffset < range.bytes.length && range.bytes[range.pastEndOffset] == ',') {
      range.pastEndOffset++;

    } else if (range.startOffset > 0 && range.bytes[range.startOffset - 1] == ',') {
      range.startOffset--;
    }
  }

  private static void swallowWhitespace(ByteRange range) {
    swallowWhitespaceLeft(range);
    swallowWhitespaceRight(range);
  }

  /**
   * Moves the start of the range left over JSON whitespace, stopping at the array start.
   */
  private static void swallowWhitespaceLeft(ByteRange range) {
    while (range.startOffset > 0 && isJsonWhitespace(range.bytes[range.startOffset - 1])) {
      range.startOffset--;
    }
  }

  /**
   * Moves the end of the range right over JSON whitespace, stopping at the array end.
   */
  private static void swallowWhitespaceRight(ByteRange range) {
    while (range.pastEndOffset < range.bytes.length && isJsonWhitespace(range.bytes[range.pastEndOffset])) {
      range.pastEndOffset++;
    }
  }

  /**
   * If the range is an object field's value, widens it leftwards to also cover the
   * colon and the quoted field name. Otherwise only leading whitespace is absorbed.
   */
  private static void swallowFieldName(ByteRange range) {
    swallowWhitespaceLeft(range);

    if (range.startOffset == 0) {
      // Nothing precedes the value, so there is no field name to swallow.
      return;
    }

    // If the target was a field, then prevChar will be the colon (:) that separates the field name from value.
    // If the target was an array element, then prevChar will be the array start token ([) or the comma (,)
    // separating the target from the previous array element.

    byte prevChar = range.bytes[range.startOffset - 1];
    if (prevChar == ':') {
      range.startOffset--; // swallow colon
      swallowWhitespaceLeft(range);
      range.startOffset--; // swallow field name closing quote

      // swallow left to include field name opening quote (guaranteed to not be preceded by backslash)
      do {
        range.startOffset--;
      } while (!(range.bytes[range.startOffset] == '"' && range.bytes[range.startOffset - 1] != '\\'));
    }
  }

  /**
   * @return true if the byte is one of the four JSON whitespace characters.
   */
  private static boolean isJsonWhitespace(byte b) {
    switch (b) {
      case 0x20: // Space
      case 0x09: // Horizontal tab
      case 0x0A: // LF
      case 0x0D: // CR
        return true;
      default:
        return false;
    }
  }
}