// Source listing of org.apache.solr.handler.admin.IndexSizeEstimator
// from the solr-core artifact: Apache Solr (module: core).
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.admin;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.lucene.codecs.StoredFieldsReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StandardDirectoryReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.SuppressForbidden;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.solr.common.MapWriter;
import org.apache.solr.common.util.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Estimates the raw size of all uncompressed indexed data by scanning term, docValues and stored
* fields data. This utility also provides detailed statistics about term, docValues, postings and
* stored fields distributions.
*/
public class IndexSizeEstimator {
/** Logger used for the progress messages written by the estimation steps. */
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
// Keys under which each per-category statistics map is stored in the "details" result.
public static final String TERMS = "terms";
public static final String STORED_FIELDS = "storedFields";
public static final String NORMS = "norms";
public static final String DOC_VALUES = "docValues";
public static final String POINTS = "points";
public static final String TERM_VECTORS = "termVectors";
// Keys of the top-level sections written by Estimate#writeMap.
public static final String SUMMARY = "summary";
public static final String DETAILS = "details";
public static final String FIELDS_BY_SIZE = "fieldsBySize";
public static final String TYPES_BY_SIZE = "typesBySize";
/** Default number of documents above which sampling is applied. */
public static final int DEFAULT_SAMPLING_THRESHOLD = 100_000;
/** Default percentage of data that is sampled once the threshold is exceeded. */
public static final float DEFAULT_SAMPLING_PERCENT = 5.0f;
/** Index reader whose leaves are scanned. */
private final IndexReader reader;
/** Capacity of the "top" priority queues (largest items kept per field). */
private final int topN;
/** Maximum length to which reported top values are truncated. */
private final int maxLength;
/** Whether the per-field summary is included in the returned Estimate. */
private final boolean withSummary;
/** Whether the detailed statistics are included in the returned Estimate. */
private final boolean withDetails;
private int samplingThreshold = DEFAULT_SAMPLING_THRESHOLD;
private float samplingPercent = DEFAULT_SAMPLING_PERCENT;
/** Stride between sampled documents; 1 means every document is visited. */
private int samplingStep = 1;
/**
 * Holder for the results of an index size estimation.
 *
 * <p>The maps passed to the constructor are stored as-is (no defensive copy). {@code summary} and
 * {@code details} may be {@code null}, in which case they are omitted from {@link #writeMap}.
 */
public static final class Estimate implements MapWriter {
  private final Map<String, Long> fieldsBySize;
  private final Map<String, Long> typesBySize;
  private final Map<String, Object> summary;
  private final Map<String, Object> details;

  /**
   * @param fieldsBySize estimated size in bytes per field, must not be null
   * @param typesBySize estimated size in bytes per data type, must not be null
   * @param summary optional per-field summary, may be null
   * @param details optional detailed statistics, may be null
   * @throws NullPointerException if {@code fieldsBySize} or {@code typesBySize} is null
   */
  public Estimate(
      Map<String, Long> fieldsBySize,
      Map<String, Long> typesBySize,
      Map<String, Object> summary,
      Map<String, Object> details) {
    Objects.requireNonNull(fieldsBySize);
    Objects.requireNonNull(typesBySize);
    this.fieldsBySize = fieldsBySize;
    this.typesBySize = typesBySize;
    this.summary = summary;
    this.details = details;
  }

  /** Returns the per-field sizes in bytes. */
  public Map<String, Long> getFieldsBySize() {
    return fieldsBySize;
  }

  /** Returns the per-type sizes in bytes. */
  public Map<String, Long> getTypesBySize() {
    return typesBySize;
  }

  /** Returns the per-field sizes formatted with {@link RamUsageEstimator#humanReadableUnits}. */
  public Map<String, String> getHumanReadableFieldsBySize() {
    return humanReadable(fieldsBySize);
  }

  /** Returns the per-type sizes formatted with {@link RamUsageEstimator#humanReadableUnits}. */
  public Map<String, String> getHumanReadableTypesBySize() {
    return humanReadable(typesBySize);
  }

  /** Returns the summary, or null when it was not requested. */
  public Map<String, Object> getSummary() {
    return summary;
  }

  /** Returns the details, or null when they were not requested. */
  public Map<String, Object> getDetails() {
    return details;
  }

  @Override
  public void writeMap(EntryWriter ew) throws IOException {
    ew.put(FIELDS_BY_SIZE, fieldsBySize);
    ew.put(TYPES_BY_SIZE, typesBySize);
    if (summary != null) {
      ew.put(SUMMARY, summary);
    }
    if (details != null) {
      ew.put(DETAILS, details);
    }
  }

  /** Converts byte counts to human-readable strings, preserving the iteration order. */
  private static Map<String, String> humanReadable(Map<String, Long> sizes) {
    Map<String, String> result = new LinkedHashMap<>();
    sizes.forEach((name, size) -> result.put(name, RamUsageEstimator.humanReadableUnits(size)));
    return result;
  }
}
/**
 * Creates an estimator over the given reader.
 *
 * @param reader index reader to scan
 * @param topN number of largest items to keep in each "top" queue
 * @param maxLength maximum length of reported item values before truncation
 * @param withSummary whether the returned estimate includes the per-field summary
 * @param withDetails whether the returned estimate includes the detailed statistics
 */
public IndexSizeEstimator(
    IndexReader reader, int topN, int maxLength, boolean withSummary, boolean withDetails) {
  // Output options.
  this.withDetails = withDetails;
  this.withSummary = withSummary;
  // Collection limits.
  this.maxLength = maxLength;
  this.topN = topN;
  // Data source.
  this.reader = reader;
}
/**
 * Sets the document-count threshold above which sampling kicks in. When the index holds more
 * documents than this, only some values are sampled and the totals are extrapolated.
 *
 * @param threshold size threshold (number of documents). Default value is {@link
 *     #DEFAULT_SAMPLING_THRESHOLD}. Values {@code <= 0} mean no threshold (and no sampling).
 */
public void setSamplingThreshold(int threshold) {
  // A non-positive threshold is stored as "never reached".
  this.samplingThreshold = threshold > 0 ? threshold : Integer.MAX_VALUE;
}
/**
 * Sampling percent (a number greater than 0 and less or equal to 100). When index size exceeds
 * the threshold then approximately only this percent of data will be retrieved from the index and
 * the totals will be extrapolated.
 *
 * <p>The sampling step is derived from {@code percent} itself, and neither the step nor the
 * stored percent is modified when the argument is rejected.
 *
 * @param percent sample percent. Default value is {@link #DEFAULT_SAMPLING_PERCENT}.
 * @throws IllegalArgumentException when value is less than or equal to 0.0 or greater than 100.0
 *     (or NaN), or the sampling percent is so small that less than 10 documents would be sampled.
 */
public void setSamplingPercent(float percent) throws IllegalArgumentException {
  // Written as a negated positive test so that NaN is rejected as well.
  if (!(percent > 0 && percent <= 100)) {
    throw new IllegalArgumentException("samplingPercent must be 0 < percent <= 100");
  }
  if (reader.maxDoc() > samplingThreshold) {
    // Use the NEW percent here; the field still holds the previous value at this point.
    int step = Math.round(100.0f / percent);
    if (log.isInfoEnabled()) {
      log.info(
          "- number of documents {} larger than {}, sampling percent is {} and sampling step {}",
          reader.maxDoc(),
          samplingThreshold,
          percent,
          step);
    }
    if (reader.maxDoc() / step < 10) {
      throw new IllegalArgumentException(
          "Out of "
              + reader.maxDoc()
              + " less than 10 documents would be sampled, which is too unreliable. Increase the samplingPercent.");
    }
    // Commit only after validation succeeded.
    samplingStep = step;
  }
  this.samplingPercent = percent;
}
/**
 * Runs all estimation passes (stored fields, terms, norms, points, term vectors, docValues),
 * builds the per-field summary and ranks fields and data types by their estimated size.
 *
 * @return the estimate; its summary / details sections are null unless they were requested in
 *     the constructor
 * @throws Exception if reading the index fails
 */
@SuppressWarnings({"unchecked"})
public Estimate estimate() throws Exception {
  Map<String, Object> details = new LinkedHashMap<>();
  Map<String, Object> summary = new LinkedHashMap<>();
  estimateStoredFields(details);
  estimateTerms(details);
  estimateNorms(details);
  estimatePoints(details);
  estimateTermVectors(details);
  estimateDocValues(details);
  estimateSummary(details, summary);
  // Added after the summary is built so these scalar entries are not treated as per-type maps.
  if (samplingStep > 1) {
    details.put("samplingPercent", samplingPercent);
    details.put("samplingStep", samplingStep);
  }

  // Rank fields by total size; fields with no measurable size are left out.
  ItemPriorityQueue fieldSizeQueue = new ItemPriorityQueue(summary.size());
  summary.forEach(
      (field, perField) -> {
        long size = ((AtomicLong) ((Map<String, Object>) perField).get("totalSize")).get();
        if (size > 0) {
          fieldSizeQueue.insertWithOverflow(new Item(field, size));
        }
      });
  Map<String, Long> fieldsBySize = new LinkedHashMap<>();
  fieldSizeQueue._forEachEntry((k, v) -> fieldsBySize.put((String) k, (Long) v));

  // Aggregate sizes per data type across all fields, using only the "*_lengths*" counters.
  Map<String, AtomicLong> typeSizes = new HashMap<>();
  summary.forEach(
      (field, perField) -> {
        Map<String, Object> perType =
            (Map<String, Object>) ((Map<String, Object>) perField).get("perType");
        perType.forEach(
            (type, size) -> {
              if (type.contains("_lengths")) {
                AtomicLong totalSize =
                    typeSizes.computeIfAbsent(
                        type.replace("_lengths", ""), t -> new AtomicLong());
                totalSize.addAndGet(((AtomicLong) size).get());
              }
            });
      });
  ItemPriorityQueue typesSizeQueue = new ItemPriorityQueue(typeSizes.size());
  typeSizes.forEach(
      (type, size) -> {
        if (size.get() > 0) {
          typesSizeQueue.insertWithOverflow(new Item(type, size.get()));
        }
      });
  Map<String, Long> typesBySize = new LinkedHashMap<>();
  typesSizeQueue._forEachEntry((k, v) -> typesBySize.put((String) k, (Long) v));

  // sort summary by field size
  Map<String, Object> newSummary = new LinkedHashMap<>();
  fieldsBySize.keySet().forEach(k -> newSummary.put(k, summary.get(k)));
  // convert everything to maps and primitives
  convert(newSummary);
  convert(details);
  return new Estimate(
      fieldsBySize, typesBySize, withSummary ? newSummary : null, withDetails ? details : null);
}
/**
 * Recursively replaces internal accumulator objects in {@code result} with plain values:
 * priority queues and summary statistics become maps, {@link AtomicLong}s become longs, and
 * nested maps are converted in place.
 *
 * @param result map to convert in place
 */
@SuppressWarnings({"unchecked"})
private void convert(Map<String, Object> result) {
  for (Map.Entry<String, Object> entry : result.entrySet()) {
    Object value = entry.getValue();
    if (value instanceof ItemPriorityQueue || value instanceof MapWriterSummaryStatistics) {
      // Both are MapWriters; note that writing out an ItemPriorityQueue empties it.
      Map<String, Object> map = new LinkedHashMap<>();
      ((MapWriter) value).toMap(map);
      entry.setValue(map);
    } else if (value instanceof AtomicLong) {
      entry.setValue(((AtomicLong) value).longValue());
    } else if (value instanceof Map) {
      // recurse
      convert((Map<String, Object>) value);
    }
  }
}
/**
 * Builds the per-field summary from the detailed statistics.
 *
 * <p>For every {@link SummaryStatistics} found under {@code details[type][field][key]} its sum
 * is added to {@code summary[field]["perType"][type + "_" + key]}; when {@code key} starts with
 * {@code "lengths"} the sum is also added to {@code summary[field]["totalSize"]}.
 *
 * @param details detailed statistics, keyed by data type and then by field
 * @param summary output map, keyed by field
 */
@SuppressWarnings({"unchecked"})
private void estimateSummary(Map<String, Object> details, Map<String, Object> summary) {
  log.info("- preparing summary...");
  details.forEach(
      (type, perType) -> {
        ((Map<String, Object>) perType)
            .forEach(
                (field, perField) -> {
                  Map<String, Object> perFieldSummary =
                      (Map<String, Object>)
                          summary.computeIfAbsent(field, f -> new HashMap<String, Object>());
                  ((Map<String, Object>) perField)
                      .forEach(
                          (k, val) -> {
                            if (!(val instanceof SummaryStatistics)) {
                              return; // e.g. top-N queues carry no size totals
                            }
                            long sum = (long) ((SummaryStatistics) val).getSum();
                            if (k.startsWith("lengths")) {
                              AtomicLong fieldTotal =
                                  (AtomicLong)
                                      perFieldSummary.computeIfAbsent(
                                          "totalSize", kt -> new AtomicLong());
                              fieldTotal.addAndGet(sum);
                            }
                            Map<String, Object> perTypeSummary =
                                (Map<String, Object>)
                                    perFieldSummary.computeIfAbsent(
                                        "perType", pt -> new HashMap<String, Object>());
                            AtomicLong typeTotal =
                                (AtomicLong)
                                    perTypeSummary.computeIfAbsent(
                                        type + "_" + k, t -> new AtomicLong());
                            typeTotal.addAndGet(sum);
                          });
                });
      });
}
/**
 * Collects norms statistics per field and stores them in {@code result} under {@link #NORMS}.
 * Every sampled document that has a norm contributes a fixed length of 8, repeated {@code
 * samplingStep} times to extrapolate for the documents that were skipped.
 *
 * @param result map receiving the per-field statistics
 * @throws IOException if reading norms fails
 */
private void estimateNorms(Map<String, Object> result) throws IOException {
  log.info("- estimating norms...");
  Map<String, Map<String, Object>> stats = new HashMap<>();
  for (LeafReaderContext leafReaderContext : reader.leaves()) {
    LeafReader leafReader = leafReaderContext.reader();
    FieldInfos fieldInfos = leafReader.getFieldInfos();
    for (FieldInfo info : fieldInfos) {
      NumericDocValues norms = leafReader.getNormValues(info.name);
      if (norms == null) {
        continue;
      }
      Map<String, Object> perField = stats.computeIfAbsent(info.name, n -> new HashMap<>());
      SummaryStatistics lengthSummary =
          (SummaryStatistics)
              perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics());
      while (norms.advance(norms.docID() + samplingStep) != DocIdSetIterator.NO_MORE_DOCS) {
        for (int i = 0; i < samplingStep; i++) {
          lengthSummary.addValue(8);
        }
      }
    }
  }
  result.put(NORMS, stats);
}
/**
 * Collects point-values statistics per field and stores them in {@code result} under {@link
 * #POINTS}. For each leaf the recorded length is {@code size * bytesPerDimension *
 * numIndexDimensions}; points are not sampled.
 *
 * @param result map receiving the per-field statistics
 * @throws IOException if reading point values fails
 */
private void estimatePoints(Map<String, Object> result) throws IOException {
  log.info("- estimating points...");
  Map<String, Map<String, Object>> stats = new HashMap<>();
  for (LeafReaderContext leafReaderContext : reader.leaves()) {
    LeafReader leafReader = leafReaderContext.reader();
    FieldInfos fieldInfos = leafReader.getFieldInfos();
    for (FieldInfo info : fieldInfos) {
      PointValues values = leafReader.getPointValues(info.name);
      if (values == null) {
        continue;
      }
      Map<String, Object> perField = stats.computeIfAbsent(info.name, n -> new HashMap<>());
      SummaryStatistics lengthSummary =
          (SummaryStatistics)
              perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics());
      lengthSummary.addValue(
          (double)
              (values.size() * values.getBytesPerDimension() * values.getNumIndexDimensions()));
    }
  }
  result.put(POINTS, stats);
}
/**
 * Collects term-vector statistics per field and stores them in {@code result} under {@link
 * #TERM_VECTORS}. Every {@code samplingStep}-th document of each leaf is visited; deleted
 * documents and documents without term vectors are skipped.
 *
 * @param result map receiving the per-field statistics
 * @throws IOException if reading term vectors fails
 */
private void estimateTermVectors(Map<String, Object> result) throws IOException {
  log.info("- estimating term vectors...");
  Map<String, Map<String, Object>> stats = new HashMap<>();
  for (LeafReaderContext leafReaderContext : reader.leaves()) {
    LeafReader leafReader = leafReaderContext.reader();
    Bits liveDocs = leafReader.getLiveDocs();
    for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
      if (liveDocs != null && !liveDocs.get(docId)) {
        continue;
      }
      Fields termVectors = leafReader.getTermVectors(docId);
      if (termVectors == null) {
        continue;
      }
      for (String field : termVectors) {
        Terms terms = termVectors.terms(field);
        if (terms == null) {
          continue;
        }
        // isSampling=true: each value is repeated samplingStep times to extrapolate.
        estimateTermStats(field, terms, stats, true);
      }
    }
  }
  result.put(TERM_VECTORS, stats);
}
/**
 * Collects docValues statistics per field and per docValues type (binary, numeric, sorted,
 * sortedNumeric, sortedSet) and stores them in {@code result} under {@link #DOC_VALUES}.
 *
 * <p>An {@link IOException} raised while measuring a single value is ignored on purpose and the
 * value is counted with length 0.
 *
 * @param result map receiving the per-field statistics
 * @throws IOException if opening or advancing a docValues iterator fails
 */
private void estimateDocValues(Map<String, Object> result) throws IOException {
  log.info("- estimating docValues...");
  Map<String, Map<String, Object>> stats = new HashMap<>();
  for (LeafReaderContext context : reader.leaves()) {
    LeafReader leafReader = context.reader();
    FieldInfos fieldInfos = leafReader.getFieldInfos();
    for (FieldInfo info : fieldInfos) {
      // binary: actual byte length of the current value
      countDocValues(
          stats,
          info.name,
          "binary",
          leafReader.getBinaryDocValues(info.name),
          values -> {
            try {
              BytesRef value = ((BinaryDocValues) values).binaryValue();
              return value.length;
            } catch (IOException ignored) {
              // best effort: fall through and count as 0
            }
            return 0;
          });
      // numeric: fixed 8 per value
      countDocValues(
          stats, info.name, "numeric", leafReader.getNumericDocValues(info.name), values -> 8);
      // sorted
      // NOTE(review): this measures the first term returned by a fresh termsEnum(), not the
      // term of the current document - confirm whether that approximation is intended.
      countDocValues(
          stats,
          info.name,
          "sorted",
          leafReader.getSortedDocValues(info.name),
          values -> {
            try {
              TermsEnum termsEnum = ((SortedDocValues) values).termsEnum();
              BytesRef term = termsEnum.next();
              if (term != null) {
                return term.length;
              }
            } catch (IOException ignored) {
              // best effort: fall through and count as 0
            }
            return 0;
          });
      // sortedNumeric: 8 per value of the current document
      countDocValues(
          stats,
          info.name,
          "sortedNumeric",
          leafReader.getSortedNumericDocValues(info.name),
          values -> ((SortedNumericDocValues) values).docValueCount() * 8);
      // sortedSet
      // NOTE(review): same first-term approximation as for "sorted" above.
      countDocValues(
          stats,
          info.name,
          "sortedSet",
          leafReader.getSortedSetDocValues(info.name),
          values -> {
            try {
              TermsEnum termsEnum = ((SortedSetDocValues) values).termsEnum();
              BytesRef term = termsEnum.next();
              if (term != null) {
                return term.length;
              }
            } catch (IOException ignored) {
              // best effort: fall through and count as 0
            }
            return 0;
          });
    }
  }
  result.put(DOC_VALUES, stats);
}
private void countDocValues(
Map> stats,
String field,
String type,
DocIdSetIterator values,
Function valueLength)
throws IOException {
if (values == null) {
return;
}
Map perField = stats.computeIfAbsent(field, n -> new HashMap<>());
SummaryStatistics lengthSummary =
(SummaryStatistics)
perField.computeIfAbsent("lengths_" + type, s -> new MapWriterSummaryStatistics());
while (values.advance(values.docID() + samplingStep) != DocIdSetIterator.NO_MORE_DOCS) {
int len = valueLength.apply(values);
for (int i = 0; i < samplingStep; i++) {
lengthSummary.addValue(len);
}
}
}
/**
 * Collects term dictionary and postings statistics for every indexed field of every leaf and
 * stores them in {@code result} under {@link #TERMS}. Terms are not sampled.
 *
 * @param result map receiving the per-field statistics
 * @throws IOException if reading terms fails
 */
private void estimateTerms(Map<String, Object> result) throws IOException {
  log.info("- estimating terms...");
  Map<String, Map<String, Object>> stats = new HashMap<>();
  for (LeafReaderContext context : reader.leaves()) {
    LeafReader leafReader = context.reader();
    FieldInfos fieldInfos = leafReader.getFieldInfos();
    for (FieldInfo info : fieldInfos) {
      Terms terms = leafReader.terms(info.name);
      if (terms == null) {
        continue;
      }
      estimateTermStats(info.name, terms, stats, false);
    }
  }
  result.put(TERMS, stats);
}
private void estimateTermStats(
String field, Terms terms, Map> stats, boolean isSampling)
throws IOException {
Map perField = stats.computeIfAbsent(field, n -> new HashMap<>());
SummaryStatistics lengthSummary =
(SummaryStatistics)
perField.computeIfAbsent("lengths_terms", s -> new MapWriterSummaryStatistics());
SummaryStatistics docFreqSummary =
(SummaryStatistics)
perField.computeIfAbsent("docFreqs", s -> new MapWriterSummaryStatistics());
SummaryStatistics totalFreqSummary =
(SummaryStatistics)
perField.computeIfAbsent("lengths_postings", s -> new MapWriterSummaryStatistics());
// TODO: add this at some point
// SummaryStatistics impactsSummary =
// (SummaryStatistics)perField.computeIfAbsent("lengths_impacts", s -> new
// MapWriterSummaryStatistics());
SummaryStatistics payloadSummary = null;
if (terms.hasPayloads()) {
payloadSummary =
(SummaryStatistics)
perField.computeIfAbsent("lengths_payloads", s -> new MapWriterSummaryStatistics());
}
ItemPriorityQueue topLen =
(ItemPriorityQueue) perField.computeIfAbsent("topLen", s -> new ItemPriorityQueue(topN));
ItemPriorityQueue topTotalFreq =
(ItemPriorityQueue)
perField.computeIfAbsent("topTotalFreq", s -> new ItemPriorityQueue(topN));
TermsEnum termsEnum = terms.iterator();
BytesRef term;
PostingsEnum postings = null;
while ((term = termsEnum.next()) != null) {
if (isSampling) {
for (int i = 0; i < samplingStep; i++) {
lengthSummary.addValue(term.length);
docFreqSummary.addValue(termsEnum.docFreq());
totalFreqSummary.addValue((double) termsEnum.totalTermFreq());
}
} else {
lengthSummary.addValue(term.length);
docFreqSummary.addValue(termsEnum.docFreq());
totalFreqSummary.addValue((double) termsEnum.totalTermFreq());
}
if (terms.hasPayloads()) {
postings = termsEnum.postings(postings, PostingsEnum.ALL);
while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int freq = postings.freq();
for (int i = 0; i < freq; i++) {
if (postings.nextPosition() < 0) {
break;
}
BytesRef payload = postings.getPayload();
if (payload != null) {
if (isSampling) {
for (int k = 0; k < samplingStep; k++) {
payloadSummary.addValue(payload.length);
}
} else {
payloadSummary.addValue(payload.length);
}
}
}
}
}
String value = term.utf8ToString();
if (value.length() > maxLength) {
value = value.substring(0, maxLength);
}
topLen.insertWithOverflow(new Item(value, term.length));
topTotalFreq.insertWithOverflow(new Item(value, termsEnum.totalTermFreq()));
}
}
/**
 * Collects stored-field statistics by visiting every {@code samplingStep}-th live document of
 * each leaf, and stores them in {@code result} under {@link #STORED_FIELDS}.
 *
 * <p>For {@link CodecReader} leaves the stored fields reader's merge instance is used; when it
 * is a separate object it is closed even if visiting a document fails.
 *
 * @param result map receiving the per-field statistics
 * @throws IOException if reading stored fields fails
 */
private void estimateStoredFields(Map<String, Object> result) throws IOException {
  log.info("- estimating stored fields...");
  Map<String, Map<String, Object>> stats = new HashMap<>();
  for (LeafReaderContext context : reader.leaves()) {
    LeafReader leafReader = context.reader();
    EstimatingVisitor visitor = new EstimatingVisitor(stats, topN, maxLength, samplingStep);
    Bits liveDocs = leafReader.getLiveDocs();
    if (leafReader instanceof CodecReader) {
      CodecReader codecReader = (CodecReader) leafReader;
      StoredFieldsReader storedFieldsReader = codecReader.getFieldsReader();
      // this instance may be faster for a full sequential pass
      StoredFieldsReader mergeInstance = storedFieldsReader.getMergeInstance();
      try {
        for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
          if (liveDocs != null && !liveDocs.get(docId)) {
            continue;
          }
          mergeInstance.document(docId, visitor);
        }
      } finally {
        // Only close what we obtained ourselves; the shared reader belongs to the leaf.
        if (mergeInstance != storedFieldsReader) {
          mergeInstance.close();
        }
      }
    } else {
      for (int docId = 0; docId < leafReader.maxDoc(); docId += samplingStep) {
        if (liveDocs != null && !liveDocs.get(docId)) {
          continue;
        }
        leafReader.document(docId, visitor);
      }
    }
  }
  result.put(STORED_FIELDS, stats);
}
/** A value paired with its size; the element type of the top-N priority queues. */
public static class Item {
  Object value;
  long size;

  /**
   * @param value the item's value (used for reporting)
   * @param size the item's size (used for ordering)
   */
  public Item(Object value, long size) {
    this.size = size;
    this.value = value;
  }

  /** Renders the item as {@code size=<size>, value=<value>}. */
  @Override
  public String toString() {
    StringBuilder text = new StringBuilder();
    text.append("size=").append(size);
    text.append(", value=").append(value);
    return text.toString();
  }
}
/** {@link SummaryStatistics} that can write its aggregate values out as a map. */
public static class MapWriterSummaryStatistics extends SummaryStatistics implements MapWriter {
  /** Writes each aggregate under a fixed key, in a fixed order. */
  @Override
  public void writeMap(EntryWriter ew) throws IOException {
    // count and range
    ew.put("n", getN()).put("min", getMin()).put("max", getMax());
    // central tendency
    ew.put("sum", getSum()).put("mean", getMean()).put("geoMean", getGeometricMean());
    // dispersion
    ew.put("variance", getVariance())
        .put("populationVariance", getPopulationVariance())
        .put("stddev", getStandardDeviation());
    // raw moments
    ew.put("secondMoment", getSecondMoment())
        .put("sumOfSquares", getSumsq())
        .put("sumOfLogs", getSumOfLogs());
  }
}
public static class ItemPriorityQueue extends PriorityQueue- implements MapWriter {
public ItemPriorityQueue(int maxSize) {
super(maxSize);
}
@Override
protected boolean lessThan(Item a, Item b) {
return a.size < b.size;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
Iterator
- it = iterator();
while (it.hasNext()) {
if (sb.length() > 0) {
sb.append('\n');
}
sb.append(it.next());
}
return sb.toString();
}
// WARNING: destructive! empties the queue
@Override
public void writeMap(EntryWriter ew) throws IOException {
Item[] items = new Item[size()];
int pos = size() - 1;
while (size() > 0) {
items[pos] = pop();
pos--;
}
for (Item item : items) {
ew.put(String.valueOf(item.value), item.size);
}
}
}
private static class EstimatingVisitor extends StoredFieldVisitor {
final Map
> stats;
final int topN;
final int maxLength;
final int samplingStep;
EstimatingVisitor(
Map> stats, int topN, int maxLength, int samplingStep) {
this.stats = stats;
this.topN = topN;
this.maxLength = maxLength;
this.samplingStep = samplingStep;
}
/**
* Process a binary field.
*
* @param value newly allocated byte array with the binary contents.
*/
@Override
public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
// trim the value if needed
int len = value != null ? value.length : 0;
if (len > maxLength) {
byte[] newValue = new byte[maxLength];
System.arraycopy(value, 0, newValue, 0, maxLength);
value = newValue;
}
String strValue = new BytesRef(value).toString();
countItem(fieldInfo.name, strValue, len);
}
/** Process a string field. */
@Override
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
// trim the value if needed
int len = value != null ? UnicodeUtil.calcUTF16toUTF8Length(value, 0, value.length()) : 0;
if (value.length() > maxLength) {
value = value.substring(0, maxLength);
}
countItem(fieldInfo.name, value, len);
}
/** Process a int numeric field. */
@Override
public void intField(FieldInfo fieldInfo, int value) throws IOException {
countItem(fieldInfo.name, String.valueOf(value), 4);
}
/** Process a long numeric field. */
@Override
public void longField(FieldInfo fieldInfo, long value) throws IOException {
countItem(fieldInfo.name, String.valueOf(value), 8);
}
/** Process a float numeric field. */
@Override
public void floatField(FieldInfo fieldInfo, float value) throws IOException {
countItem(fieldInfo.name, String.valueOf(value), 4);
}
/** Process a double numeric field. */
@Override
public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
countItem(fieldInfo.name, String.valueOf(value), 8);
}
private void countItem(String field, Object value, int size) {
Map perField = stats.computeIfAbsent(field, n -> new HashMap<>());
SummaryStatistics summary =
(SummaryStatistics)
perField.computeIfAbsent("lengths", s -> new MapWriterSummaryStatistics());
for (int i = 0; i < samplingStep; i++) {
summary.addValue(size);
}
ItemPriorityQueue topNqueue =
(ItemPriorityQueue) perField.computeIfAbsent("topLen", s -> new ItemPriorityQueue(topN));
topNqueue.insertWithOverflow(new Item(value, size));
}
@Override
public Status needsField(FieldInfo fieldInfo) throws IOException {
return Status.YES;
}
}
/**
 * Command-line entry point: estimates the index at the given path and prints the result as JSON
 * to standard output.
 *
 * <p>Exit codes: 0 on success, -1 when called without arguments (usage is printed), -2 when the
 * index path or an option value is missing.
 *
 * @param args {@code [-topN NUM] [-maxLen NUM] [-summary] [-details] <indexDir>}
 * @throws Exception if the index cannot be opened or read, or a numeric option is malformed
 */
@SuppressForbidden(reason = "System.err and System.out required for a command-line utility")
public static void main(String[] args) throws Exception {
  if (args.length == 0) {
    System.err.println(
        "Usage: "
            + IndexSizeEstimator.class.getName()
            + " [-topN NUM] [-maxLen NUM] [-summary] [-details] <indexDir>");
    System.err.println();
    System.err.println("\t<indexDir>\tpath to the index (parent path of 'segments_N' file)");
    System.err.println("\t-topN NUM\tnumber of top largest items to collect");
    System.err.println("\t-maxLen NUM\ttruncate the largest items to NUM bytes / characters");
    // Terminate here; previously "-1" was printed and execution continued.
    System.exit(-1);
  }
  String path = null;
  int topN = 20;
  int maxLen = 100;
  boolean details = false;
  boolean summary = false;
  for (int i = 0; i < args.length; i++) {
    String arg = args[i];
    boolean takesValue = arg.equals("-topN") || arg.equals("-maxLen");
    if (takesValue && i + 1 >= args.length) {
      System.err.println("ERROR: " + arg + " requires a NUM argument.");
      System.exit(-2);
    }
    if (arg.equals("-topN")) {
      topN = Integer.parseInt(args[++i]);
    } else if (arg.equals("-maxLen")) {
      maxLen = Integer.parseInt(args[++i]);
    } else if (arg.equals("-details")) {
      details = true;
    } else if (arg.equals("-summary")) {
      summary = true;
    } else {
      path = arg;
    }
  }
  if (path == null) {
    System.err.println("ERROR: <indexDir> argument is required.");
    System.exit(-2);
  }
  // Close reader and directory before exiting.
  try (Directory dir = FSDirectory.open(Paths.get(path));
      DirectoryReader reader = StandardDirectoryReader.open(dir)) {
    IndexSizeEstimator stats = new IndexSizeEstimator(reader, topN, maxLen, summary, details);
    System.out.println(Utils.toJSONString(stats.estimate()));
  }
  System.exit(0);
}
}