org.apache.lucene.analysis.hunspell.SuggestibleEntryCache Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-analysis-common Show documentation
Apache Lucene (module: common)
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.hunspell;

import java.util.function.Consumer;
import org.apache.lucene.internal.hppc.IntObjectHashMap;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;

/**
 * A cache allowing for CPU-cache-friendlier iteration over {@link WordStorage} entries that can be
 * used for suggestions. The words and the form data are stored in plain contiguous arrays with no
 * compression.
 */
class SuggestibleEntryCache {
  private static final short LOWER_CASE = (short) WordCase.LOWER.ordinal();
  private static final short NEUTRAL_CASE = (short) WordCase.NEUTRAL.ordinal();
  private static final short TITLE_CASE = (short) WordCase.TITLE.ordinal();

  private final Section[] sections;

  private SuggestibleEntryCache(IntObjectHashMap builders, int maxLength) {
    sections = new Section[maxLength + 1];
    for (int i = 0; i < sections.length; i++) {
      SectionBuilder builder = builders.get(i);
      sections[i] = builder == null ? null : builder.build(i);
    }
  }

  static SuggestibleEntryCache buildCache(WordStorage storage) {
    var consumer =
        new Consumer() {
          final IntObjectHashMap builders = new IntObjectHashMap<>();
          int maxLength;

          @Override
          public void accept(FlyweightEntry entry) {
            CharsRef root = entry.root();
            if (root.length > Short.MAX_VALUE) {
              throw new UnsupportedOperationException(
                  "Too long dictionary entry, please report this to [email protected]");
            } else if (root.length > maxLength) {
              maxLength = root.length;
            }

            SectionBuilder builder;
            int index = builders.indexOf(root.length);
            if (index < 0) {
              builder = new SectionBuilder();
              builders.indexInsert(index, root.length, builder);
            } else {
              builder = builders.indexGet(index);
            }
            builder.add(entry);
          }
        };
    storage.processSuggestibleWords(1, Integer.MAX_VALUE, consumer);

    return new SuggestibleEntryCache(consumer.builders, consumer.maxLength);
  }

  private static class SectionBuilder {
    final StringBuilder roots = new StringBuilder(), lowRoots = new StringBuilder();
    short[] meta = new short[10];
    int[] formData = new int[10];
    int metaOffset, formDataOffset;

    void add(FlyweightEntry entry) {
      CharsRef root = entry.root();
      if (root.length > Short.MAX_VALUE) {
        throw new UnsupportedOperationException(
            "Too long dictionary entry, please report this to [email protected]");
      }

      IntsRef forms = entry.forms();

      short rootCase = (short) WordCase.caseOf(root).ordinal();

      meta = ArrayUtil.grow(meta, metaOffset + 2);
      meta[metaOffset] = (short) forms.length;
      meta[metaOffset + 1] = rootCase;
      metaOffset += 2;

      lowRoots.append(entry.lowerCaseRoot());
      if (hasUpperCase(rootCase)) {
        roots.append(root.chars, root.offset, root.length);
      }

      formData = ArrayUtil.grow(formData, formDataOffset + forms.length);
      System.arraycopy(forms.ints, forms.offset, formData, formDataOffset, forms.length);
      formDataOffset += forms.length;
    }

    Section build(int rootLength) {
      return new Section(
          rootLength,
          ArrayUtil.copyOfSubArray(meta, 0, metaOffset),
          roots.toString().toCharArray(),
          lowRoots.toString().toCharArray(),
          ArrayUtil.copyOfSubArray(formData, 0, formDataOffset));
    }
  }

  private static boolean hasUpperCase(short rootCase) {
    return rootCase != LOWER_CASE && rootCase != NEUTRAL_CASE;
  }

  void processSuggestibleWords(int minLength, int maxLength, Consumer processor) {
    maxLength = Math.min(maxLength, sections.length - 1);
    for (int i = Math.min(minLength, sections.length); i <= maxLength; i++) {
      Section section = sections[i];
      if (section != null) {
        section.processWords(processor);
      }
    }
  }

  /**
   * @param meta The lengths of the entry sub-arrays in formData plus the case information
   * @param roots original roots if they're not all-lowercase
   */
  private record Section(
      int rootLength, short[] meta, char[] roots, char[] lowRoots, int[] formData) {

    void processWords(Consumer processor) {
      CharsRef chars = new CharsRef(roots, 0, Math.min(rootLength, roots.length));
      CharsRef lowerChars = new CharsRef(lowRoots, 0, rootLength);
      IntsRef forms = new IntsRef(formData, 0, 0);

      var entry =
          new FlyweightEntry() {
            short wordCase;

            @Override
            CharsRef root() {
              return hasUpperCase(wordCase) ? chars : lowerChars;
            }

            @Override
            boolean hasTitleCase() {
              return wordCase == TITLE_CASE;
            }

            @Override
            CharSequence lowerCaseRoot() {
              return lowerChars;
            }

            @Override
            IntsRef forms() {
              return forms;
            }
          };

      for (int i = 0; i < meta.length; i += 2) {
        short formDataLength = meta[i];
        short wordCase = meta[i + 1];
        forms.length = formDataLength;
        entry.wordCase = wordCase;
        processor.accept(entry);

        lowerChars.offset += rootLength;
        if (hasUpperCase(wordCase)) {
          chars.offset += rootLength;
        }
        forms.offset += formDataLength;
      }
    }
  }
}