org.apache.lucene.codecs.lucene45.Lucene45DocValuesConsumer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aem-sdk-api Show documentation
Show all versions of aem-sdk-api Show documentation
The Adobe Experience Manager SDK
/*
* COPIED FROM APACHE LUCENE 4.7.2
*
* Git URL: [email protected]:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
*
* (see https://issues.apache.org/jira/browse/OAK-10786 for details)
*/
package org.apache.lucene.codecs.lucene45;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Closeable; // javadocs
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.BlockPackedWriter;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;
/** writer for {@link Lucene45DocValuesFormat} */
public class Lucene45DocValuesConsumer extends DocValuesConsumer implements Closeable {
static final int BLOCK_SIZE = 16384;
static final int ADDRESS_INTERVAL = 16;
static final Number MISSING_ORD = Long.valueOf(-1);
/** Compressed using packed blocks of ints. */
public static final int DELTA_COMPRESSED = 0;
/** Compressed by computing the GCD. */
public static final int GCD_COMPRESSED = 1;
/** Compressed by giving IDs to unique values. */
public static final int TABLE_COMPRESSED = 2;
/** Uncompressed binary, written directly (fixed length). */
public static final int BINARY_FIXED_UNCOMPRESSED = 0;
/** Uncompressed binary, written directly (variable length). */
public static final int BINARY_VARIABLE_UNCOMPRESSED = 1;
/** Compressed binary with shared prefixes */
public static final int BINARY_PREFIX_COMPRESSED = 2;
/** Standard storage for sorted set values with 1 level of indirection:
* docId -> address -> ord. */
public static final int SORTED_SET_WITH_ADDRESSES = 0;
/** Single-valued sorted set values, encoded as sorted values, so no level
* of indirection: docId -> ord. */
public static final int SORTED_SET_SINGLE_VALUED_SORTED = 1;
final IndexOutput data, meta;
final int maxDoc;
/** expert: Creates a new writer */
public Lucene45DocValuesConsumer(SegmentWriteState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
boolean success = false;
try {
String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
data = state.directory.createOutput(dataName, state.context);
CodecUtil.writeHeader(data, dataCodec, Lucene45DocValuesFormat.VERSION_CURRENT);
String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
meta = state.directory.createOutput(metaName, state.context);
CodecUtil.writeHeader(meta, metaCodec, Lucene45DocValuesFormat.VERSION_CURRENT);
maxDoc = state.segmentInfo.getDocCount();
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(this);
}
}
}
@Override
public void addNumericField(FieldInfo field, Iterable values) throws IOException {
addNumericField(field, values, true);
}
void addNumericField(FieldInfo field, Iterable values, boolean optimizeStorage) throws IOException {
long count = 0;
long minValue = Long.MAX_VALUE;
long maxValue = Long.MIN_VALUE;
long gcd = 0;
boolean missing = false;
// TODO: more efficient?
HashSet uniqueValues = null;
if (optimizeStorage) {
uniqueValues = new HashSet();
for (Number nv : values) {
final long v;
if (nv == null) {
v = 0;
missing = true;
} else {
v = nv.longValue();
}
if (gcd != 1) {
if (v < Long.MIN_VALUE / 2 || v > Long.MAX_VALUE / 2) {
// in that case v - minValue might overflow and make the GCD computation return
// wrong results. Since these extreme values are unlikely, we just discard
// GCD computation for them
gcd = 1;
} else if (count != 0) { // minValue needs to be set first
gcd = MathUtil.gcd(gcd, v - minValue);
}
}
minValue = Math.min(minValue, v);
maxValue = Math.max(maxValue, v);
if (uniqueValues != null) {
if (uniqueValues.add(v)) {
if (uniqueValues.size() > 256) {
uniqueValues = null;
}
}
}
++count;
}
} else {
for (@SuppressWarnings("unused") Number nv : values) {
++count;
}
}
final long delta = maxValue - minValue;
final int format;
if (uniqueValues != null
&& (delta < 0L || PackedInts.bitsRequired(uniqueValues.size() - 1) < PackedInts.bitsRequired(delta))
&& count <= Integer.MAX_VALUE) {
format = TABLE_COMPRESSED;
} else if (gcd != 0 && gcd != 1) {
format = GCD_COMPRESSED;
} else {
format = DELTA_COMPRESSED;
}
meta.writeVInt(field.number);
meta.writeByte(Lucene45DocValuesFormat.NUMERIC);
meta.writeVInt(format);
if (missing) {
meta.writeLong(data.getFilePointer());
writeMissingBitset(values);
} else {
meta.writeLong(-1L);
}
meta.writeVInt(PackedInts.VERSION_CURRENT);
meta.writeLong(data.getFilePointer());
meta.writeVLong(count);
meta.writeVInt(BLOCK_SIZE);
switch (format) {
case GCD_COMPRESSED:
meta.writeLong(minValue);
meta.writeLong(gcd);
final BlockPackedWriter quotientWriter = new BlockPackedWriter(data, BLOCK_SIZE);
for (Number nv : values) {
long value = nv == null ? 0 : nv.longValue();
quotientWriter.add((value - minValue) / gcd);
}
quotientWriter.finish();
break;
case DELTA_COMPRESSED:
final BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
for (Number nv : values) {
writer.add(nv == null ? 0 : nv.longValue());
}
writer.finish();
break;
case TABLE_COMPRESSED:
final Long[] decode = uniqueValues.toArray(new Long[uniqueValues.size()]);
final HashMap encode = new HashMap();
meta.writeVInt(decode.length);
for (int i = 0; i < decode.length; i++) {
meta.writeLong(decode[i]);
encode.put(decode[i], i);
}
final int bitsRequired = PackedInts.bitsRequired(uniqueValues.size() - 1);
final PackedInts.Writer ordsWriter = PackedInts.getWriterNoHeader(data, PackedInts.Format.PACKED, (int) count, bitsRequired, PackedInts.DEFAULT_BUFFER_SIZE);
for (Number nv : values) {
ordsWriter.add(encode.get(nv == null ? 0 : nv.longValue()));
}
ordsWriter.finish();
break;
default:
throw new AssertionError();
}
}
// TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
// but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode)
void writeMissingBitset(Iterable> values) throws IOException {
byte bits = 0;
int count = 0;
for (Object v : values) {
if (count == 8) {
data.writeByte(bits);
count = 0;
bits = 0;
}
if (v != null) {
bits |= 1 << (count & 7);
}
count++;
}
if (count > 0) {
data.writeByte(bits);
}
}
@Override
public void addBinaryField(FieldInfo field, Iterable values) throws IOException {
// write the byte[] data
meta.writeVInt(field.number);
meta.writeByte(Lucene45DocValuesFormat.BINARY);
int minLength = Integer.MAX_VALUE;
int maxLength = Integer.MIN_VALUE;
final long startFP = data.getFilePointer();
long count = 0;
boolean missing = false;
for(BytesRef v : values) {
final int length;
if (v == null) {
length = 0;
missing = true;
} else {
length = v.length;
}
minLength = Math.min(minLength, length);
maxLength = Math.max(maxLength, length);
if (v != null) {
data.writeBytes(v.bytes, v.offset, v.length);
}
count++;
}
meta.writeVInt(minLength == maxLength ? BINARY_FIXED_UNCOMPRESSED : BINARY_VARIABLE_UNCOMPRESSED);
if (missing) {
meta.writeLong(data.getFilePointer());
writeMissingBitset(values);
} else {
meta.writeLong(-1L);
}
meta.writeVInt(minLength);
meta.writeVInt(maxLength);
meta.writeVLong(count);
meta.writeLong(startFP);
// if minLength == maxLength, its a fixed-length byte[], we are done (the addresses are implicit)
// otherwise, we need to record the length fields...
if (minLength != maxLength) {
meta.writeLong(data.getFilePointer());
meta.writeVInt(PackedInts.VERSION_CURRENT);
meta.writeVInt(BLOCK_SIZE);
final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
long addr = 0;
for (BytesRef v : values) {
if (v != null) {
addr += v.length;
}
writer.add(addr);
}
writer.finish();
}
}
/** expert: writes a value dictionary for a sorted/sortedset field */
protected void addTermsDict(FieldInfo field, final Iterable values) throws IOException {
// first check if its a "fixed-length" terms dict
int minLength = Integer.MAX_VALUE;
int maxLength = Integer.MIN_VALUE;
for (BytesRef v : values) {
minLength = Math.min(minLength, v.length);
maxLength = Math.max(maxLength, v.length);
}
if (minLength == maxLength) {
// no index needed: direct addressing by mult
addBinaryField(field, values);
} else {
// header
meta.writeVInt(field.number);
meta.writeByte(Lucene45DocValuesFormat.BINARY);
meta.writeVInt(BINARY_PREFIX_COMPRESSED);
meta.writeLong(-1L);
// now write the bytes: sharing prefixes within a block
final long startFP = data.getFilePointer();
// currently, we have to store the delta from expected for every 1/nth term
// we could avoid this, but its not much and less overall RAM than the previous approach!
RAMOutputStream addressBuffer = new RAMOutputStream();
MonotonicBlockPackedWriter termAddresses = new MonotonicBlockPackedWriter(addressBuffer, BLOCK_SIZE);
BytesRef lastTerm = new BytesRef();
long count = 0;
for (BytesRef v : values) {
if (count % ADDRESS_INTERVAL == 0) {
termAddresses.add(data.getFilePointer() - startFP);
// force the first term in a block to be abs-encoded
lastTerm.length = 0;
}
// prefix-code
int sharedPrefix = StringHelper.bytesDifference(lastTerm, v);
data.writeVInt(sharedPrefix);
data.writeVInt(v.length - sharedPrefix);
data.writeBytes(v.bytes, v.offset + sharedPrefix, v.length - sharedPrefix);
lastTerm.copyBytes(v);
count++;
}
final long indexStartFP = data.getFilePointer();
// write addresses of indexed terms
termAddresses.finish();
addressBuffer.writeTo(data);
addressBuffer = null;
termAddresses = null;
meta.writeVInt(minLength);
meta.writeVInt(maxLength);
meta.writeVLong(count);
meta.writeLong(startFP);
meta.writeVInt(ADDRESS_INTERVAL);
meta.writeLong(indexStartFP);
meta.writeVInt(PackedInts.VERSION_CURRENT);
meta.writeVInt(BLOCK_SIZE);
}
}
@Override
public void addSortedField(FieldInfo field, Iterable values, Iterable docToOrd) throws IOException {
meta.writeVInt(field.number);
meta.writeByte(Lucene45DocValuesFormat.SORTED);
addTermsDict(field, values);
addNumericField(field, docToOrd, false);
}
private static boolean isSingleValued(Iterable docToOrdCount) {
for (Number ordCount : docToOrdCount) {
if (ordCount.longValue() > 1) {
return false;
}
}
return true;
}
@Override
public void addSortedSetField(FieldInfo field, Iterable values, final Iterable docToOrdCount, final Iterable ords) throws IOException {
meta.writeVInt(field.number);
meta.writeByte(Lucene45DocValuesFormat.SORTED_SET);
if (isSingleValued(docToOrdCount)) {
meta.writeVInt(SORTED_SET_SINGLE_VALUED_SORTED);
// The field is single-valued, we can encode it as SORTED
addSortedField(field, values, new Iterable() {
@Override
public Iterator iterator() {
final Iterator docToOrdCountIt = docToOrdCount.iterator();
final Iterator ordsIt = ords.iterator();
return new Iterator() {
@Override
public boolean hasNext() {
assert ordsIt.hasNext() ? docToOrdCountIt.hasNext() : true;
return docToOrdCountIt.hasNext();
}
@Override
public Number next() {
final Number ordCount = docToOrdCountIt.next();
if (ordCount.longValue() == 0) {
return MISSING_ORD;
} else {
assert ordCount.longValue() == 1;
return ordsIt.next();
}
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
});
return;
}
meta.writeVInt(SORTED_SET_WITH_ADDRESSES);
// write the ord -> byte[] as a binary field
addTermsDict(field, values);
// write the stream of ords as a numeric field
// NOTE: we could return an iterator that delta-encodes these within a doc
addNumericField(field, ords, false);
// write the doc -> ord count as a absolute index to the stream
meta.writeVInt(field.number);
meta.writeByte(Lucene45DocValuesFormat.NUMERIC);
meta.writeVInt(DELTA_COMPRESSED);
meta.writeLong(-1L);
meta.writeVInt(PackedInts.VERSION_CURRENT);
meta.writeLong(data.getFilePointer());
meta.writeVLong(maxDoc);
meta.writeVInt(BLOCK_SIZE);
final MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
long addr = 0;
for (Number v : docToOrdCount) {
addr += v.longValue();
writer.add(addr);
}
writer.finish();
}
@Override
public void close() throws IOException {
boolean success = false;
try {
if (meta != null) {
meta.writeVInt(-1); // write EOF marker
}
success = true;
} finally {
if (success) {
IOUtils.close(data, meta);
} else {
IOUtils.closeWhileHandlingException(data, meta);
}
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy