/*
* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.appengine.api.search.dev;
import com.google.apphosting.api.AppEngineInternal;
import com.google.apphosting.api.search.DocumentPb;
import com.google.apphosting.api.search.DocumentPb.FacetValue;
import com.google.apphosting.api.search.DocumentPb.FieldValue.ContentType;
import com.google.appengine.repackaged.com.google.common.collect.ImmutableSet;
import com.google.appengine.repackaged.com.google.protobuf.InvalidProtocolBufferException;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.text.DateFormat;
import java.text.NumberFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.TimeZone;
import java.util.logging.Logger;
import com.google.appengine.repackaged.org.apache.lucene.analysis.TokenStream;
import com.google.appengine.repackaged.org.apache.lucene.document.AbstractField;
import com.google.appengine.repackaged.org.apache.lucene.document.Document;
import com.google.appengine.repackaged.org.apache.lucene.document.Field;
import com.google.appengine.repackaged.org.apache.lucene.document.Fieldable;
import com.google.appengine.repackaged.org.apache.lucene.document.NumericField;
import com.google.appengine.repackaged.org.apache.lucene.index.Term;
import com.google.appengine.repackaged.org.apache.lucene.search.Query;
import com.google.appengine.repackaged.org.apache.lucene.search.TermQuery;
import org.jsoup.Jsoup;
/** Various utilities to interface with Lucene. */
@AppEngineInternal
public final class LuceneUtils {
/**
* The name of the field under which we store tokens for all fields, so
* that we can search for them without a field prefix.
*/
public static final String FIELDLESS_FIELD_NAME = "_GLOBAL";
/**
* The name of the field under which we store the AppEngine document ID.
*/
static final String DOCID_FIELD_NAME = "_DOCID";
/**
* The name of the field under which we store the original AppEngine document.
*/
static final String ORIGINAL_DOC_FIELD_NAME = "_ORIGINALDOC";
/**
* The name of the field under which we store a value that allows us
* to search for all documents.
*/
static final String ALLDOCS_FIELD_NAME = "_ALLDOC";
/**
* The name of the field under which we store the document's locale code.
*/
static final String LOCALE_FIELD_NAME = "_LOCALE";
/**
* The token stored with each document that allows us to find all documents.
*/
static final String ALLDOCS_FIELD_VALUE = "X";
/**
* This token is not stored in the ALLDOCS_FIELD_NAME field. It is used to
* construct queries that match no documents.
*/
static final String ALLDOCS_FIELD_MISSING_VALUE = "Y";
/** The field that stores order ID. */
static final String ORDER_ID_FIELD_NAME = "_rank";
private static final Logger log = Logger.getLogger(LuceneUtils.class.getName());
static final String CONVERTED_HTML_TYPE = "HTML2TEXT";
public static final long MSEC_PER_DAY = 86400000L;
/**
* Prefix string for a Lucene field that represents a facet.
*/
static final String FACET_NAME_PREFIX = "facet_";
/**
* Prefix string for a Lucene field that represents a Search API field.
* Note: kept as empty string for backward compatibility.
*/
static final String FIELD_NAME_PREFIX = "";
/**
* Word separator characters. This is package-private for testing.
*/
static final ImmutableSet<Character> WORD_SEPARATORS = ImmutableSet.of(
'!', '"', '%', '(', ')', '*', ',', '.', '/', ':', '=', '>', '?', '@', '[', '\\', ']', '^',
'`', '{', '|', '}', '~', '\t', '\n', '\f', '\r', ' ', '&', '#', '$', ';');
/**
* The fraction of characters that must be CJK for us to tokenize the string as CJK.
*
* This should be set fairly low (certainly no higher than 50%, probably
* closer to 20%); its only purpose is to make sure mostly-Latin text with a
* few CJK characters is still segmented as Latin.
*/
private static final float CJK_CHARACTER_THRESHOLD = 0.2f;
/**
* Set of all CJK Unicode Blocks.
*/
private static final ImmutableSet<Character.UnicodeBlock> CJK_BLOCKS = ImmutableSet.of(
Character.UnicodeBlock.BOPOMOFO,
Character.UnicodeBlock.BOPOMOFO_EXTENDED,
Character.UnicodeBlock.CJK_COMPATIBILITY,
Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS,
Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS,
Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT,
Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS,
Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS,
Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS,
Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO,
Character.UnicodeBlock.HANGUL_JAMO,
Character.UnicodeBlock.HANGUL_SYLLABLES,
Character.UnicodeBlock.HIRAGANA,
Character.UnicodeBlock.IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
Character.UnicodeBlock.KANBUN,
Character.UnicodeBlock.KANGXI_RADICALS,
Character.UnicodeBlock.KATAKANA,
Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS,
Character.UnicodeBlock.TAI_XUAN_JING_SYMBOLS,
Character.UnicodeBlock.YI_RADICALS,
Character.UnicodeBlock.YI_SYLLABLES,
Character.UnicodeBlock.YIJING_HEXAGRAM_SYMBOLS
);
/**
* The UTC time zone.
*/
private static final ThreadLocal<TimeZone> UTC_TZ =
new ThreadLocal<TimeZone>() {
@Override protected TimeZone initialValue() {
return TimeZone.getTimeZone("UTC");
}
};
private static DateFormat getDateFormat(String formatString) {
DateFormat format = new SimpleDateFormat(formatString, Locale.US);
format.setTimeZone(UTC_TZ.get());
return format;
}
private static final ThreadLocal<DateFormat> ISO8601_SIMPLE =
new ThreadLocal<DateFormat>() {
@Override protected DateFormat initialValue() {
return getDateFormat("yyyy-MM-dd");
}
};
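/**
* Maps a Search API field to the Lucene field name under which its value is indexed,
* by combining the value's content type and the field name. For illustration, a
* hypothetical NUMBER field named "price" maps to the Lucene field name "NUMBER@price".
*/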
public static String makeLuceneFieldName(DocumentPb.Field field) {
return makeLuceneFieldName(field.getName(), field.getValue().getType());
}
public static String makeLuceneFieldName(String name, DocumentPb.FieldValue.ContentType type) {
return FIELD_NAME_PREFIX + type + "@" + name;
}
static String makeLuceneFieldName(String name, FacetValue.ContentType contentType) {
return FACET_NAME_PREFIX + contentType + "@" + name;
}
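/**
* Like {@link #makeLuceneFieldName(String, DocumentPb.FieldValue.ContentType)}, except that
* HTML fields map to a separate name under which the text extracted from the markup is
* indexed. For illustration, a hypothetical HTML field named "body" maps to "HTML2TEXT@body".
*/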
public static String makeLuceneFieldNameWithExtractedText(
String name, DocumentPb.FieldValue.ContentType type) {
if (type == ContentType.HTML) {
return CONVERTED_HTML_TYPE + "@" + name;
}
return makeLuceneFieldName(name, type);
}
public static String makeLuceneFieldNameWithExtractedText(DocumentPb.Field field) {
return makeLuceneFieldNameWithExtractedText(field.getName(), field.getValue().getType());
}
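/**
* Extracts the plain text from an HTML string using jsoup, or returns an empty string if
* the parsed document has no body. An illustrative call (input value is hypothetical):
* <pre>{@code
* // jsoup strips the markup and collapses whitespace, yielding "Hello world".
* String text = LuceneUtils.extractTextFromHtml("<p>Hello <b>world</b></p>");
* }</pre>
*/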
public static String extractTextFromHtml(String html) {
org.jsoup.nodes.Document doc = Jsoup.parse(html);
org.jsoup.nodes.Element body = doc.body();
return body != null ? body.text() : "";
}
/**
* Locale-aware Double parsing.
*
* Double.valueOf(String) cannot parse numbers formatted with a comma as the decimal
* separator, which some locales produce. This method parses with the default locale's
* NumberFormat, which takes that into account.
*/
public static double stringValueToDouble(String value) {
try {
return NumberFormat.getNumberInstance().parse(value).doubleValue();
} catch (ParseException e) {
throw new RuntimeException(e);
}
}
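/**
* Converts a Search API facet to a Lucene field: ATOM facets become stored, unanalyzed
* string fields, and NUMBER facets become stored NumericFields. Any other facet type is
* rejected with an IllegalArgumentException.
*/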
static AbstractField convertFacetToLuceneField(DocumentPb.Facet facet) {
DocumentPb.FacetValue facetValue = facet.getValue();
String facetName = makeLuceneFieldName(facet.getName(), facetValue.getType());
String value = facetValue.getStringValue();
switch(facetValue.getType()) {
case ATOM:
return new Field(facetName, value, Field.Store.YES, Field.Index.NOT_ANALYZED);
case NUMBER:
NumericField numericField = new NumericField(facetName, Field.Store.YES, true);
numericField.setDoubleValue(stringValueToDouble(value));
return numericField;
default:
throw new IllegalArgumentException("Facet type " + facetValue.getType() + " not handled");
}
}
static boolean isFacetField(Fieldable field) {
return field.name().startsWith(FACET_NAME_PREFIX);
}
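/**
* Recovers a Search API facet from a Lucene field written by
* {@link #convertFacetToLuceneField}. For illustration, a hypothetical field named
* "facet_ATOM@color" with value "red" becomes an ATOM facet named "color" with string
* value "red".
*/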
static DocumentPb.Facet convertLuceneFieldToFacet(Fieldable field) {
if (!isFacetField(field)) {
throw new IllegalArgumentException(
String.format("Field %s is not a facet field", field.name()));
}
String typeName = field.name().substring(LuceneUtils.FACET_NAME_PREFIX.length());
int atIndex = typeName.indexOf("@");
return DocumentPb.Facet.newBuilder().setName(typeName.substring(atIndex + 1))
.setValue(DocumentPb.FacetValue.newBuilder().setStringValue(field.stringValue()).setType(
DocumentPb.FacetValue.ContentType.valueOf(typeName.substring(0, atIndex))).build())
.build();
}
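/**
* Converts a single Search API field into the Lucene fields used to index it. Most types
* also emit a companion value under FIELDLESS_FIELD_NAME so the value can be matched
* without a field prefix; prefix and geo fields skip the companion value.
*/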
public static List<AbstractField> toLuceneFields(DocumentPb.Field field) {
List<AbstractField> output = new ArrayList<>();
DocumentPb.FieldValue fieldValue = field.getValue();
Field.Index globalIndexStrategy = Field.Index.ANALYZED;
String fieldName = makeLuceneFieldName(field);
String value = fieldValue.getStringValue();
boolean makeGlobalField = true;
switch (fieldValue.getType()) {
case HTML:
// Store original html
output.add(new Field(fieldName, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
value = extractTextFromHtml(value);
output.add(new Field(makeLuceneFieldNameWithExtractedText(field), value,
Field.Store.YES, Field.Index.ANALYZED));
break;
case TEXT:
output.add(new Field(fieldName, value, Field.Store.YES, globalIndexStrategy));
break;
case ATOM:
value = value.toLowerCase();
output.add(new Field(fieldName, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
globalIndexStrategy = Field.Index.NOT_ANALYZED;
break;
case UNTOKENIZED_PREFIX:
globalIndexStrategy = Field.Index.NOT_ANALYZED;
value = PrefixFieldAnalyzerUtil.normalizePrefixField(value);
for (String prefix : PrefixFieldAnalyzerUtil.createUntokenizedPrefixes(value)) {
output.add(new Field(fieldName, prefix, Field.Store.NO, Field.Index.NOT_ANALYZED));
}
makeGlobalField = false;
break;
case TOKENIZED_PREFIX:
TokenStream stream = PrefixFieldAnalyzerUtil.getTokenizedPrefixTokenStreamForIndexing(
new StringReader(value));
output.add(new Field(fieldName, stream));
makeGlobalField = false;
break;
case DATE:
NumericField dateField = new NumericField(fieldName, Field.Store.YES, true);
// Store date as long value of days since Jan 1 1970
try {
long days = dateStringToLong(value) / MSEC_PER_DAY;
value = Long.toString(days);
dateField.setLongValue(days);
} catch (ParseException e) {
log.warning("Failed to parse date for " + fieldName + ": " + value);
dateField.setLongValue(0L);
}
output.add(dateField);
globalIndexStrategy = Field.Index.NOT_ANALYZED;
break;
case NUMBER:
// TODO: Lucene docs insist on reusing the same NumericFields across documents.
NumericField numericField = new NumericField(fieldName, Field.Store.YES, true);
numericField.setDoubleValue(stringValueToDouble(value));
output.add(numericField);
globalIndexStrategy = Field.Index.NOT_ANALYZED;
break;
case GEO:
output.add(new GeometricField(fieldName, fieldValue.getGeo()));
makeGlobalField = false;
break;
default:
throw new IllegalArgumentException("Field type " + fieldValue.getType() + " not handled");
}
if (makeGlobalField) {
output.add(new Field(FIELDLESS_FIELD_NAME, value, Field.Store.NO, globalIndexStrategy));
}
return output;
}
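/**
* Converts an AppEngine Search document into the Lucene document that is indexed in its
* place, including bookkeeping fields for the match-all token, the docId, the original
* document bytes, and the order ID. A minimal usage sketch (the docId here is hypothetical):
* <pre>{@code
* DocumentPb.Document pb = DocumentPb.Document.newBuilder().setId("doc-1").build();
* Document luceneDoc = LuceneUtils.toLuceneDocument("doc-1", pb);
* }</pre>
*/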
public static Document toLuceneDocument(String docId, DocumentPb.Document input) {
Document output = new Document();
// Don't store the locale if the user didn't set it explicitly.
if (input.hasLanguage()) {
// Store the locale in an unindexed field
output.add(
new Field(LOCALE_FIELD_NAME, input.getLanguage(), Field.Store.YES, Field.Index.NO));
}
for (DocumentPb.Facet facet : input.getFacetList()) {
output.add(convertFacetToLuceneField(facet));
}
for (DocumentPb.Field field : input.getFieldList()) {
for (AbstractField luceneField : toLuceneFields(field)) {
output.add(luceneField);
}
}
// Special fields, added to each document:
// - a field that allows us to match all documents
// - a field that stores the AppEngine document's docId
// - a field that stores the original AppEngine document
// - a field that stores the document's order ID
output.add(new Field(ALLDOCS_FIELD_NAME,
ALLDOCS_FIELD_VALUE, Field.Store.NO, Field.Index.NOT_ANALYZED));
output.add(new Field(DOCID_FIELD_NAME, docId,
Field.Store.YES, Field.Index.NOT_ANALYZED));
output.add(new Field(ORIGINAL_DOC_FIELD_NAME, input.toByteArray(),
Field.Store.YES));
output.add(new Field(ORDER_ID_FIELD_NAME, Integer.toString(input.getOrderId()),
Field.Store.YES, Field.Index.NOT_ANALYZED));
return output;
}
/**
* Heuristically guesses whether the data in reader is in a CJK language.
*
* @param reader The data to tokenize.
* @param readerContents The data in reader is copied to readerContents
* so that the caller can still use the data after this method consumes the reader.
*
* @return True if the fraction of Chinese, Japanese, or Korean characters
* in the text exceeds CJK_CHARACTER_THRESHOLD.
*
* @throws IOException if reading from the reader fails.
*/
static boolean isProbablyCjk(Reader reader, StringBuilder readerContents) throws IOException {
/* Buffer of size 1024 used because most documents (95%) are under 1kb in
* size and thus will fit into the buffer with a single read. */
char[] buffer = new char[1024];
long cjkChars = 0L;
long totalChars = 0L;
/* Can't use reader.ready(), because it's possible that reader.ready() will return false when
* there's still more data but it's not ready to be read yet (because of buffering or some other
* source of asynchrony). reader.read() will block if necessary, and return -1 if there's no
* more data to read, so we use that to control the loop instead. */
while (true) {
int len = reader.read(buffer);
if (len < 0) {
break;
}
totalChars += len;
readerContents.append(buffer, 0, len);
for (int i = 0; i < len; i++) {
if (CJK_BLOCKS.contains(Character.UnicodeBlock.of(buffer[i]))) {
cjkChars++;
}
}
}
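// If the reader was empty, 0/0 yields NaN, and NaN > CJK_CHARACTER_THRESHOLD is
// false, so empty input is treated as non-CJK.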
return (float) cjkChars / totalChars > CJK_CHARACTER_THRESHOLD;
}
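/**
* Parses a stored date value as milliseconds since the Unix epoch. The value is normally
* a decimal long; for backward compatibility, a simple ISO 8601 date ("yyyy-MM-dd",
* interpreted in UTC) is also accepted, e.g. a hypothetical value "2000-01-01".
*/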
public static Long dateStringToLong(String value) throws ParseException {
try {
return Long.parseLong(value);
} catch (IllegalArgumentException exception) {
// TODO: delete support for passing ISO 8601 formatted strings when
// we have switched over to storing milliseconds.
return ISO8601_SIMPLE.get().parse(value).getTime();
}
}
/**
* Checks whether the provided string parses as a simple ISO 8601 date (yyyy-MM-dd).
*/
public static boolean isDateString(String value) {
try {
ISO8601_SIMPLE.get().parse(value);
return true;
} catch (ParseException e) {
return false;
}
}
public static double numericFieldToDouble(Fieldable f) {
// Lucene returns Field instead of NumericField from the getFields() call
// for documents read from the index.
if (f instanceof NumericField) {
NumericField numericField = (NumericField) f;
return numericField.getNumericValue().doubleValue();
} else {
return Double.parseDouble(f.stringValue());
}
}
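/**
* Recovers a Search API field value from a stored Lucene field. TEXT, HTML, ATOM, DATE,
* and NUMBER values come back as strings; GEO values come back as a two-element double
* array holding the two comma-separated coordinates in stored order.
*/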
public static Object luceneFieldToValue(Fieldable f, ContentType type) {
switch (type) {
case TEXT:
case HTML:
case ATOM:
return ((Field) f).stringValue();
case DATE:
// Lucene returns Field instead of NumericField from the getFields() call
// for documents read from the index.
long value;
if (f instanceof NumericField) {
value = ((NumericField) f).getNumericValue().longValue();
} else {
value = Long.parseLong(f.stringValue());
}
return Long.toString(value);
case NUMBER:
// Lucene returns Field instead of NumericField from the getFields() call
// for documents read from the index.
if (f instanceof NumericField) {
NumericField numericField = (NumericField) f;
return Double.toString(numericField.getNumericValue().doubleValue());
} else {
return f.stringValue();
}
case GEO:
String[] parts = ((Field) f).stringValue().split(",", 2);
return new double[] { Double.parseDouble(parts[0]), Double.parseDouble(parts[1]) };
default:
throw new IllegalArgumentException(
"Failed to correctly handle type " + type);
}
}
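/**
* Builds an AppEngine document containing only the docId recovered from the
* DOCID_FIELD_NAME field of the given Lucene document.
*/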
static DocumentPb.Document.Builder toAppengineDocumentIdBuilder(Document d) {
String docId = ((Field) d.getFieldable(DOCID_FIELD_NAME)).stringValue();
DocumentPb.Document.Builder docBuilder = DocumentPb.Document.newBuilder();
docBuilder.setId(docId);
return docBuilder;
}
public static DocumentPb.Document toAppengineDocumentId(Document d) {
return toAppengineDocumentIdBuilder(d).build();
}
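/**
* Rebuilds the original AppEngine document from the protocol buffer bytes stored under
* ORIGINAL_DOC_FIELD_NAME, or returns null if the Lucene document lacks that field.
*/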
static DocumentPb.Document.Builder toAppengineDocumentBuilder(Document d)
throws InvalidProtocolBufferException {
Fieldable doc = d.getFieldable(ORIGINAL_DOC_FIELD_NAME);
if (doc == null) {
return null;
}
DocumentPb.Document.Builder docBuilder = DocumentPb.Document.newBuilder();
docBuilder.mergeFrom(doc.getBinaryValue(), doc.getBinaryOffset(), doc.getBinaryLength());
return docBuilder;
}
public static DocumentPb.Document toAppengineDocument(Document d)
throws InvalidProtocolBufferException {
DocumentPb.Document.Builder docBuilder = toAppengineDocumentBuilder(d);
if (docBuilder != null) {
return docBuilder.build();
} else {
return null;
}
}
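/**
* Returns a query that matches every document, implemented as a TermQuery on the
* sentinel token stored with each document under ALLDOCS_FIELD_NAME.
*/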
public static Query getMatchAnyDocumentQuery() {
return new TermQuery(new Term(ALLDOCS_FIELD_NAME, ALLDOCS_FIELD_VALUE));
}
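/**
* Returns a query that matches no documents, implemented as a TermQuery on a token
* that is never stored under ALLDOCS_FIELD_NAME.
*/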
public static Query getMatchNoneQuery() {
return new TermQuery(new Term(ALLDOCS_FIELD_NAME, ALLDOCS_FIELD_MISSING_VALUE));
}
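/** Returns the Term that identifies the Lucene document with the given AppEngine docId. */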
public static Term newDeleteTerm(String docId) {
return new Term(DOCID_FIELD_NAME, docId);
}
}