/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.admin;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StandardDirectoryReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SuppressForbidden;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.MapWriter;
import org.apache.solr.common.util.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Estimates the raw size of all uncompressed indexed data by scanning term, docValues and stored
 * fields data. This utility also provides detailed statistics about term, docValues, postings and
 * stored fields distributions.
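 *
 * <p>A minimal usage sketch (the {@code indexDir} value below is illustrative), mirroring the
 * {@link #main(String[])} command-line utility:
 *
 * <pre>{@code
 * Directory dir = FSDirectory.open(Paths.get(indexDir));
 * try (DirectoryReader reader = StandardDirectoryReader.open(dir)) {
 *   IndexSizeEstimator estimator = new IndexSizeEstimator(reader, 20, 100, true, false);
 *   System.out.println(Utils.toJSONString(estimator.estimate()));
 * }
 * }</pre>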
 */
public class IndexSizeEstimator {
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  public static final String TERMS = "terms";
  public static final String STORED_FIELDS = "storedFields";
  public static final String NORMS = "norms";
  public static final String DOC_VALUES = "docValues";
  public static final String POINTS = "points";
  public static final String TERM_VECTORS = "termVectors";
  public static final String SUMMARY = "summary";
  public static final String DETAILS = "details";
  public static final String FIELDS_BY_SIZE = "fieldsBySize";
  public static final String TYPES_BY_SIZE = "typesBySize";

  public static final int DEFAULT_SAMPLING_THRESHOLD = 100_000;
  public static final float DEFAULT_SAMPLING_PERCENT = 5.0f;

  private final IndexReader reader;
  private final int topN;
  private final int maxLength;
  private final boolean withSummary;
  private final boolean withDetails;
  private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD;
  private float samplingPercent = DEFAULT_SAMPLING_PERCENT;
  private int samplingStep = 1;

  public static final class Estimate implements MapWriter {
    private final Map<String, Long> fieldsBySize;
    private final Map<String, Long> typesBySize;
    private final Map<String, Object> summary;
    private final Map<String, Object> details;

    public Estimate(
        Map<String, Long> fieldsBySize,
        Map<String, Long> typesBySize,
        Map<String, Object> summary,
        Map<String, Object> details) {
      Objects.requireNonNull(fieldsBySize);
      Objects.requireNonNull(typesBySize);
      this.fieldsBySize = fieldsBySize;
      this.typesBySize = typesBySize;
      this.summary = summary;
      this.details = details;
    }

    public Map<String, Long> getFieldsBySize() {
      return fieldsBySize;
    }

    public Map<String, Long> getTypesBySize() {
      return typesBySize;
    }

    public Map<String, String> getHumanReadableFieldsBySize() {
      LinkedHashMap<String, String> result = new LinkedHashMap<>();
      fieldsBySize.forEach(
          (field, size) -> result.put(field, RamUsageEstimator.humanReadableUnits(size)));
      return result;
    }

    public Map<String, String> getHumanReadableTypesBySize() {
      LinkedHashMap<String, String> result = new LinkedHashMap<>();
      typesBySize.forEach(
          (field, size) -> result.put(field, RamUsageEstimator.humanReadableUnits(size)));
      return result;
    }

    public Map<String, Object> getSummary() {
      return summary;
    }

    public Map<String, Object> getDetails() {
      return details;
    }

    @Override
    public void writeMap(EntryWriter ew) throws IOException {
      ew.put(FIELDS_BY_SIZE, fieldsBySize);
      ew.put(TYPES_BY_SIZE, typesBySize);
      if (summary != null) {
        ew.put(SUMMARY, summary);
      }
      if (details != null) {
        ew.put(DETAILS, details);
      }
    }
  }

  public IndexSizeEstimator(
      IndexReader reader, int topN, int maxLength, boolean withSummary, boolean withDetails) {
    this.reader = reader;
    this.topN = topN;
    this.maxLength = maxLength;
    this.withSummary = withSummary;
    this.withDetails = withDetails;
  }

  /**
   * Set the sampling threshold. If the index has more documents than this threshold then only some
   * values will be sampled and the totals will be extrapolated.
   *
   * @param threshold size threshold (number of documents). Default value is {@link
   *     #DEFAULT_SAMPLING_THRESHOLD}. Setting this to values <= 0 means no threshold (and no
   *     sampling).
   */
  public void setSamplingThreshold(int threshold) {
    if (threshold <= 0) {
      threshold = Integer.MAX_VALUE;
    }
    this.samplingThreshold = threshold;
  }

  /**
   * Sampling percent (a number greater than 0 and less than or equal to 100). When the index size
   * exceeds the threshold, approximately only this percent of the data will be retrieved from the
   * index and the totals will be extrapolated.
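   *
   * <p>For example, with the default sampling percent of 5.0 the sampling step becomes {@code
   * Math.round(100.0f / 5.0f) == 20}, i.e. roughly every 20th document is read and each sampled
   * value is counted 20 times when the totals are extrapolated.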
   *
   * @param percent sample percent. Default value is {@link #DEFAULT_SAMPLING_PERCENT}.
   * @throws IllegalArgumentException when value is less than or equal to 0.0 or greater than 100.0,
   *     or the sampling percent is so small that less than 10 documents would be sampled.
   */
  public void setSamplingPercent(float percent) throws IllegalArgumentException {
    if (percent <= 0 || percent > 100) {
      throw new IllegalArgumentException("samplingPercent must be 0 < percent <= 100");
    }
    if (reader.maxDoc() > samplingThreshold) {
      samplingStep = Math.round(100.0f / samplingPercent);
      if (log.isInfoEnabled()) {
        log.info(
            "- number of documents {} larger than {}, sampling percent is {} and sampling step {}",
            reader.maxDoc(),
            samplingThreshold,
            samplingPercent,
            samplingStep);
      }
      if (reader.maxDoc() / samplingStep < 10) {
        throw new IllegalArgumentException(
            "Out of "
                + reader.maxDoc()
                + " less than 10 documents would be sampled, which is too unreliable. Increase the samplingPercent.");
      }
    }
    this.samplingPercent = percent;
  }

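  /**
   * Runs the estimation by scanning stored fields, terms, norms, points, term vectors and
   * docValues (honoring the configured sampling step, if any) and aggregating the per-field and
   * per-type totals into an {@link Estimate}.
   */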
  @SuppressWarnings({"unchecked"})
  public Estimate estimate() throws Exception {
    Map<String, Object> details = new LinkedHashMap<>();
    Map<String, Object> summary = new LinkedHashMap<>();
    estimateStoredFields(details);
    estimateTerms(details);
    estimateNorms(details);
    estimatePoints(details);
    estimateTermVectors(details);
    estimateDocValues(details);
    estimateSummary(details, summary);
    if (samplingStep > 1) {
      details.put("samplingPercent", samplingPercent);
      details.put("samplingStep", samplingStep);
    }
    ItemPriorityQueue fieldSizeQueue = new ItemPriorityQueue(summary.size());
    summary.forEach(
        (field, perField) -> {
          long size = ((AtomicLong) ((Map<String, Object>) perField).get("totalSize")).get();
          if (size > 0) {
            fieldSizeQueue.insertWithOverflow(new Item(field, size));
          }
        });
    Map<String, Long> fieldsBySize = new LinkedHashMap<>();
    fieldSizeQueue._forEachEntry((k, v) -> fieldsBySize.put((String) k, (Long) v));
    Map<String, AtomicLong> typeSizes = new HashMap<>();
    summary.forEach(
        (field, perField) -> {
          Map<String, Object> perType =
              (Map<String, Object>) ((Map<String, Object>) perField).get("perType");
          perType.forEach(
              (type, size) -> {
                if (type.contains("_lengths")) {
                  AtomicLong totalSize =
                      typeSizes.computeIfAbsent(
                          type.replace("_lengths", ""), t -> new AtomicLong());
                  totalSize.addAndGet(((AtomicLong) size).get());
                }
              });
        });
    ItemPriorityQueue typesSizeQueue = new ItemPriorityQueue(typeSizes.size());
    typeSizes.forEach(
        (type, size) -> {
          if (size.get() > 0) {
            typesSizeQueue.insertWithOverflow(new Item(type, size.get()));
          }
        });
    Map<String, Long> typesBySize = new LinkedHashMap<>();
    typesSizeQueue._forEachEntry((k, v) -> typesBySize.put((String) k, (Long) v));
    // sort summary by field size
    Map<String, Object> newSummary = new LinkedHashMap<>();
    fieldsBySize.keySet().forEach(k -> newSummary.put(String.valueOf(k), summary.get(k)));
    // convert everything to maps and primitives
    convert(newSummary);
    convert(details);
    return new Estimate(
        fieldsBySize, typesBySize, withSummary ? newSummary : null, withDetails ? details : null);
  }

  @SuppressWarnings({"unchecked"})
  private void convert(Map<String, Object> result) {
    for (Map.Entry<String, Object> entry : result.entrySet()) {
      Object value = entry.getValue();
      if (value instanceof ItemPriorityQueue) {
        ItemPriorityQueue queue = (ItemPriorityQueue) value;
        Map<String, Object> map = new LinkedHashMap<>();
        queue.toMap(map);
        entry.setValue(map);
      } else if (value instanceof MapWriterSummaryStatistics) {
        MapWriterSummaryStatistics stats = (MapWriterSummaryStatistics) value;
        Map<String, Object> map = new LinkedHashMap<>();
        stats.toMap(map);
        entry.setValue(map);
      } else if (value instanceof AtomicLong) {
        entry.setValue(((AtomicLong) value).longValue());
      } else if (value instanceof Map) {
        // recurse
        convert((Map<String, Object>) value);
      }
    }
  }

  @SuppressWarnings({"unchecked"})
  private void estimateSummary(Map<String, Object> details, Map<String, Object> summary) {
    log.info("- preparing summary...");
    details.forEach(
        (type, perType) -> {
          ((Map<String, Object>) perType)
              .forEach(
                  (field, perField) -> {
                    Map<String, Object> perFieldSummary =
                        (Map<String, Object>) summary.computeIfAbsent(field, f -> new HashMap<>());
                    ((Map<String, Object>) perField)
                        .forEach(
                            (k, val) -> {
                              if (val instanceof SummaryStatistics) {
                                SummaryStatistics stats = (SummaryStatistics) val;
                                if (k.startsWith("lengths")) {
                                  AtomicLong total =
                                      (AtomicLong)
                                          perFieldSummary.computeIfAbsent(
                                              "totalSize", kt -> new AtomicLong());
                                  total.addAndGet((long) stats.getSum());
                                }
                                Map<String, Object> perTypeSummary =
                                    (Map<String, Object>)
                                        perFieldSummary.computeIfAbsent(
                                            "perType", pt -> new HashMap<>());
                                AtomicLong total =
                                    (AtomicLong)
                                        perTypeSummary.computeIfAbsent(
                                            type + "_" + k, t -> new AtomicLong());
                                total.addAndGet((long) stats.getSum());
                              }
                            });
                  });
        });
  }

  private void estimateNorms(Map<String, Object> result) throws IOException {
    log.info("- estimating norms...");
    Map<String, Map<String, Object>> stats = new HashMap<>();
    for (LeafReaderContext leafReaderContext : reader.leaves()) {
      LeafReader leafReader = leafReaderContext.reader();
      FieldInfos fieldInfos = leafReader.getFieldInfos();
      for (FieldInfo info : fieldInfos) {
        NumericDocValues norms = leafReader.getNormValues(info.name);
        if (norms == null) {
          continue;
        }
        Map<String, Object> perField = stats.computeIfAbsent(info.name, n -> new HashMap<>());
        SummaryStatistics lengthSummary =
            (SummaryStatistics)
                perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics());
        while (norms.advance(norms.docID() + samplingStep) != DocIdSetIterator.NO_MORE_DOCS) {
          for (int i = 0; i < samplingStep; i++) {
            lengthSummary.addValue(8);
          }
        }
      }
    }
    result.put(NORMS, stats);
  }

  private void estimatePoints(Map<String, Object> result) throws IOException {
    log.info("- estimating points...");
    Map<String, Map<String, Object>> stats = new HashMap<>();
    for (LeafReaderContext leafReaderContext : reader.leaves()) {
      LeafReader leafReader = leafReaderContext.reader();
      FieldInfos fieldInfos = leafReader.getFieldInfos();
      for (FieldInfo info : fieldInfos) {
        PointValues values = leafReader.getPointValues(info.name);
        if (values == null) {
          continue;
        }
        Map<String, Object> perField = stats.computeIfAbsent(info.name, n -> new HashMap<>());
        SummaryStatistics lengthSummary =
            (SummaryStatistics)
                perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics());
        lengthSummary.addValue(
            (double)
                (values.size() * values.getBytesPerDimension() * values.getNumIndexDimensions()));
      }
    }
    result.put(POINTS, stats);
  }

  private void estimateTermVectors(Map<String, Object> result) throws IOException {
    log.info("- estimating term vectors...");
    Map<String, Map<String, Object>> stats = new HashMap<>();
    for (LeafReaderContext leafReaderContext : reader.leaves()) {
      LeafReader leafReader = leafReaderContext.reader();
      Bits liveDocs = leafReader.getLiveDocs();
      for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
        if (liveDocs != null && !liveDocs.get(docId)) {
          continue;
        }
        Fields termVectors = leafReader.getTermVectors(docId);
        if (termVectors == null) {
          continue;
        }
        for (String field : termVectors) {
          Terms terms = termVectors.terms(field);
          if (terms == null) {
            continue;
          }
          estimateTermStats(field, terms, stats, true);
        }
      }
    }
    result.put(TERM_VECTORS, stats);
  }

  private void estimateDocValues(Map<String, Object> result) throws IOException {
    log.info("- estimating docValues...");
    Map<String, Map<String, Object>> stats = new HashMap<>();
    for (LeafReaderContext context : reader.leaves()) {
      LeafReader leafReader = context.reader();
      FieldInfos fieldInfos = leafReader.getFieldInfos();
      for (FieldInfo info : fieldInfos) {
        // binary
        countDocValues(
            stats,
            info.name,
            "binary",
            leafReader.getBinaryDocValues(info.name),
            values -> {
              try {
                BytesRef value = ((BinaryDocValues) values).binaryValue();
                return value.length;
              } catch (IOException e) {
                // ignore
              }
              return 0;
            });
        // numeric
        countDocValues(
            stats, info.name, "numeric", leafReader.getNumericDocValues(info.name), values -> 8);
        countDocValues(
            stats,
            info.name,
            "sorted",
            leafReader.getSortedDocValues(info.name),
            values -> {
              try {
                TermsEnum termsEnum = ((SortedDocValues) values).termsEnum();
                BytesRef term;
                while ((term = termsEnum.next()) != null) {
                  return term.length;
                }
              } catch (IOException e) {
                // ignore
              }
              return 0;
            });
        countDocValues(
            stats,
            info.name,
            "sortedNumeric",
            leafReader.getSortedNumericDocValues(info.name),
            values -> ((SortedNumericDocValues) values).docValueCount() * 8);
        countDocValues(
            stats,
            info.name,
            "sortedSet",
            leafReader.getSortedSetDocValues(info.name),
            values -> {
              try {
                TermsEnum termsEnum = ((SortedSetDocValues) values).termsEnum();
                BytesRef term;
                while ((term = termsEnum.next()) != null) {
                  return term.length;
                }
              } catch (IOException e) {
                // ignore
              }
              return 0;
            });
      }
    }
    result.put(DOC_VALUES, stats);
  }

  private void countDocValues(
      Map<String, Map<String, Object>> stats,
      String field,
      String type,
      DocIdSetIterator values,
      Function<DocIdSetIterator, Integer> valueLength)
      throws IOException {
    if (values == null) {
      return;
    }
    Map<String, Object> perField = stats.computeIfAbsent(field, n -> new HashMap<>());
    SummaryStatistics lengthSummary =
        (SummaryStatistics)
            perField.computeIfAbsent("lengths_" + type, s -> new MapWriterSummaryStatistics());
    while (values.advance(values.docID() + samplingStep) != DocIdSetIterator.NO_MORE_DOCS) {
      int len = valueLength.apply(values);
      for (int i = 0; i < samplingStep; i++) {
        lengthSummary.addValue(len);
      }
    }
  }

  private void estimateTerms(Map<String, Object> result) throws IOException {
    log.info("- estimating terms...");
    Map<String, Map<String, Object>> stats = new HashMap<>();
    for (LeafReaderContext context : reader.leaves()) {
      LeafReader leafReader = context.reader();
      FieldInfos fieldInfos = leafReader.getFieldInfos();
      for (FieldInfo info : fieldInfos) {
        Terms terms = leafReader.terms(info.name);
        if (terms == null) {
          continue;
        }
        estimateTermStats(info.name, terms, stats, false);
      }
    }
    result.put(TERMS, stats);
  }

  private void estimateTermStats(
      String field, Terms terms, Map<String, Map<String, Object>> stats, boolean isSampling)
      throws IOException {
    Map<String, Object> perField = stats.computeIfAbsent(field, n -> new HashMap<>());
    SummaryStatistics lengthSummary =
        (SummaryStatistics)
            perField.computeIfAbsent("lengths_terms", s -> new MapWriterSummaryStatistics());
    SummaryStatistics docFreqSummary =
        (SummaryStatistics)
            perField.computeIfAbsent("docFreqs", s -> new MapWriterSummaryStatistics());
    SummaryStatistics totalFreqSummary =
        (SummaryStatistics)
            perField.computeIfAbsent("lengths_postings", s -> new MapWriterSummaryStatistics());
    // TODO: add this at some point
    // SummaryStatistics impactsSummary =
    // (SummaryStatistics)perField.computeIfAbsent("lengths_impacts", s -> new
    // MapWriterSummaryStatistics());
    SummaryStatistics payloadSummary = null;
    if (terms.hasPayloads()) {
      payloadSummary =
          (SummaryStatistics)
              perField.computeIfAbsent("lengths_payloads", s -> new MapWriterSummaryStatistics());
    }
    ItemPriorityQueue topLen =
        (ItemPriorityQueue) perField.computeIfAbsent("topLen", s -> new ItemPriorityQueue(topN));
    ItemPriorityQueue topTotalFreq =
        (ItemPriorityQueue)
            perField.computeIfAbsent("topTotalFreq", s -> new ItemPriorityQueue(topN));
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    PostingsEnum postings = null;
    while ((term = termsEnum.next()) != null) {
      if (isSampling) {
        for (int i = 0; i < samplingStep; i++) {
          lengthSummary.addValue(term.length);
          docFreqSummary.addValue(termsEnum.docFreq());
          totalFreqSummary.addValue((double) termsEnum.totalTermFreq());
        }
      } else {
        lengthSummary.addValue(term.length);
        docFreqSummary.addValue(termsEnum.docFreq());
        totalFreqSummary.addValue((double) termsEnum.totalTermFreq());
      }
      if (terms.hasPayloads()) {
        postings = termsEnum.postings(postings, PostingsEnum.ALL);
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          int freq = postings.freq();
          for (int i = 0; i < freq; i++) {
            if (postings.nextPosition() < 0) {
              break;
            }
            BytesRef payload = postings.getPayload();
            if (payload != null) {
              if (isSampling) {
                for (int k = 0; k < samplingStep; k++) {
                  payloadSummary.addValue(payload.length);
                }
              } else {
                payloadSummary.addValue(payload.length);
              }
            }
          }
        }
      }
      String value = term.utf8ToString();
      if (value.length() > maxLength) {
        value = value.substring(0, maxLength);
      }
      topLen.insertWithOverflow(new Item(value, term.length));
      topTotalFreq.insertWithOverflow(new Item(value, termsEnum.totalTermFreq()));
    }
  }

  private void estimateStoredFields(Map<String, Object> result) throws IOException {
    log.info("- estimating stored fields...");
    Map<String, Map<String, Object>> stats = new HashMap<>();
    for (LeafReaderContext context : reader.leaves()) {
      LeafReader leafReader = context.reader();
      EstimatingVisitor visitor = new EstimatingVisitor(stats, topN, maxLength, samplingStep);
      Bits liveDocs = leafReader.getLiveDocs();
      if (leafReader instanceof CodecReader) {
        CodecReader codecReader = (CodecReader) leafReader;
        StoredFieldsReader storedFieldsReader = codecReader.getFieldsReader();
        // this instance may be faster for a full sequential pass
        StoredFieldsReader mergeInstance = storedFieldsReader.getMergeInstance();
        for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
          if (liveDocs != null && !liveDocs.get(docId)) {
            continue;
          }
          mergeInstance.document(docId, visitor);
        }
        if (mergeInstance != storedFieldsReader) {
          mergeInstance.close();
        }
      } else {
        for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
          if (liveDocs != null && !liveDocs.get(docId)) {
            continue;
          }
          leafReader.document(docId, visitor);
        }
      }
    }
    result.put(STORED_FIELDS, stats);
  }

  public static class Item {
    Object value;
    long size;

    public Item(Object value, long size) {
      this.value = value;
      this.size = size;
    }

    @Override
    public String toString() {
      return "size=" + size + ", value=" + value;
    }
  }

  public static class MapWriterSummaryStatistics extends SummaryStatistics implements MapWriter {

    @Override
    public void writeMap(EntryWriter ew) throws IOException {
      ew.put("n", getN());
      ew.put("min", getMin());
      ew.put("max", getMax());
      ew.put("sum", getSum());
      ew.put("mean", getMean());
      ew.put("geoMean", getGeometricMean());
      ew.put("variance", getVariance());
      ew.put("populationVariance", getPopulationVariance());
      ew.put("stddev", getStandardDeviation());
      ew.put("secondMoment", getSecondMoment());
      ew.put("sumOfSquares", getSumsq());
      ew.put("sumOfLogs", getSumOfLogs());
    }
  }

  public static class ItemPriorityQueue extends PriorityQueue<Item> implements MapWriter {

    public ItemPriorityQueue(int maxSize) {
      super(maxSize);
    }

    @Override
    protected boolean lessThan(Item a, Item b) {
      return a.size < b.size;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      Iterator<Item> it = iterator();
      while (it.hasNext()) {
        if (sb.length() > 0) {
          sb.append('\n');
        }
        sb.append(it.next());
      }
      return sb.toString();
    }

    // WARNING: destructive! empties the queue
    @Override
    public void writeMap(EntryWriter ew) throws IOException {
      Item[] items = new Item[size()];
      int pos = size() - 1;
      while (size() > 0) {
        items[pos] = pop();
        pos--;
      }
      for (Item item : items) {
        ew.put(String.valueOf(item.value), item.size);
      }
    }
  }

  private static class EstimatingVisitor extends StoredFieldVisitor {
    final Map<String, Map<String, Object>> stats;
    final int topN;
    final int maxLength;
    final int samplingStep;

    EstimatingVisitor(
        Map<String, Map<String, Object>> stats, int topN, int maxLength, int samplingStep) {
      this.stats = stats;
      this.topN = topN;
      this.maxLength = maxLength;
      this.samplingStep = samplingStep;
    }

    /**
     * Process a binary field.
     *
     * @param value newly allocated byte array with the binary contents.
     */
    @Override
    public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
      // trim the value if needed
      int len = value != null ? value.length : 0;
      if (len > maxLength) {
        byte[] newValue = new byte[maxLength];
        System.arraycopy(value, 0, newValue, 0, maxLength);
        value = newValue;
      }
      String strValue = new BytesRef(value).toString();
      countItem(fieldInfo.name, strValue, len);
    }

    /** Process a string field. */
    @Override
    public void stringField(FieldInfo fieldInfo, String value) throws IOException {
      // trim the value if needed
      int len = value != null ? UnicodeUtil.calcUTF16toUTF8Length(value, 0, value.length()) : 0;
      if (value.length() > maxLength) {
        value = value.substring(0, maxLength);
      }
      countItem(fieldInfo.name, value, len);
    }

    /** Process an int numeric field. */
    @Override
    public void intField(FieldInfo fieldInfo, int value) throws IOException {
      countItem(fieldInfo.name, String.valueOf(value), 4);
    }

    /** Process a long numeric field. */
    @Override
    public void longField(FieldInfo fieldInfo, long value) throws IOException {
      countItem(fieldInfo.name, String.valueOf(value), 8);
    }

    /** Process a float numeric field. */
    @Override
    public void floatField(FieldInfo fieldInfo, float value) throws IOException {
      countItem(fieldInfo.name, String.valueOf(value), 4);
    }

    /** Process a double numeric field. */
    @Override
    public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
      countItem(fieldInfo.name, String.valueOf(value), 8);
    }

    private void countItem(String field, Object value, int size) {
      Map<String, Object> perField = stats.computeIfAbsent(field, n -> new HashMap<>());
      SummaryStatistics summary =
          (SummaryStatistics)
              perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics());
      for (int i = 0; i < samplingStep; i++) {
        summary.addValue(size);
      }
      ItemPriorityQueue topNqueue =
          (ItemPriorityQueue) perField.computeIfAbsent("topLen", s -> new ItemPriorityQueue(topN));
      topNqueue.insertWithOverflow(new Item(value, size));
    }

    @Override
    public Status needsField(FieldInfo fieldInfo) throws IOException {
      return Status.YES;
    }
  }

  @SuppressForbidden(reason = "System.err and System.out required for a command-line utility")
  public static void main(String[] args) throws Exception {
    if (args.length == 0) {
      System.err.println(
          "Usage: "
              + IndexSizeEstimator.class.getName()
              + " [-topN NUM] [-maxLen NUM] [-summary] [-details] ");
      System.err.println();
      System.err.println("\t\tpath to the index (parent path of 'segments_N' file)");
      System.err.println("\t-topN NUM\tnumber of top largest items to collect");
      System.err.println("\t-maxLen NUM\ttruncate the largest items to NUM bytes / characters");
      System.exit(-1);
    }
    String path = null;
    int topN = 20;
    int maxLen = 100;
    boolean details = false;
    boolean summary = false;
    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-topN")) {
        topN = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-maxLen")) {
        maxLen = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-details")) {
        details = true;
      } else if (args[i].equals("-summary")) {
        summary = true;
      } else {
        path = args[i];
      }
    }
    if (path == null) {
      System.err.println("ERROR:  argument is required.");
      System.exit(-2);
    }
    Directory dir = FSDirectory.open(Paths.get(path));
    DirectoryReader reader = StandardDirectoryReader.open(dir);
    IndexSizeEstimator stats = new IndexSizeEstimator(reader, topN, maxLen, summary, details);
    System.out.println(Utils.toJSONString(stats.estimate()));
    System.exit(0);
  }
}