All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.simpletext;

import java.io.IOException;
import java.math.BigInteger;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.EmptyDocValuesProducer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;

/**
 * Writes doc values as human-readable text.
 *
 * <p>For every field a header is written ({@code field <name>}, {@code type <type>}, followed by
 * type-specific entries such as doc count, min/max and a digit pattern). The header is followed by
 * one fixed-width record per document of the segment, so that every record of a field occupies the
 * same number of bytes. {@link #close()} terminates the file with an {@code END} line and a
 * checksum.
 */
class SimpleTextDocValuesWriter extends DocValuesConsumer {
  static final BytesRef END = new BytesRef("END");
  static final BytesRef FIELD = new BytesRef("field ");
  static final BytesRef TYPE = new BytesRef("  type ");
  static final BytesRef DOCCOUNT = new BytesRef("  doccount ");
  // used for numerics
  static final BytesRef ORIGIN = new BytesRef("  origin "); // for deltas

  // NOTE(review): the literal below is spelled "minalue" (missing 'v'). It is written verbatim
  // into the output, so correcting the spelling would change the written format. Presumably the
  // matching reader uses this same constant -- verify before changing it.
  static final BytesRef MINVALUE = new BytesRef("  minalue ");
  static final BytesRef MAXVALUE = new BytesRef("  maxvalue ");

  static final BytesRef PATTERN = new BytesRef("  pattern ");
  // used for bytes
  static final BytesRef LENGTH = new BytesRef("length ");
  static final BytesRef MAXLENGTH = new BytesRef("  maxlength ");
  // used for sorted bytes
  static final BytesRef NUMVALUES = new BytesRef("  numvalues ");
  static final BytesRef ORDPATTERN = new BytesRef("  ordpattern ");

  /** Output all fields are written to; set to {@code null} once {@link #close()} has run. */
  IndexOutput data;

  /** Reusable buffer handed to {@link SimpleTextUtil} when writing strings. */
  final BytesRefBuilder scratch = new BytesRefBuilder();

  /** {@code maxDoc} of the segment; exactly this many per-document records are written per field. */
  final int numDocs;

  /** Names of the fields written so far; only consulted from assertions. */
  private final Set<String> fieldsSeen = new HashSet<>(); // for asserting

  /**
   * Creates the output file for the segment described by {@code state}.
   *
   * @param state segment write state providing the directory, segment name/suffix, IO context and
   *     {@code maxDoc}
   * @param ext file extension of the file to create
   * @throws IOException if the output cannot be created
   */
  public SimpleTextDocValuesWriter(SegmentWriteState state, String ext) throws IOException {
    data =
        state.directory.createOutput(
            IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, ext),
            state.context);
    numDocs = state.segmentInfo.maxDoc();
  }

  /**
   * Records {@code field} as written and asserts that it was not written before.
   *
   * <p>Always returns {@code true} so that the call can itself be wrapped in an {@code assert},
   * which makes the bookkeeping disappear entirely when assertions are disabled.
   */
  private boolean fieldSeen(String field) {
    assert !fieldsSeen.contains(field)
        : "field \"" + field + "\" was added more than once during flush";
    fieldsSeen.add(field);
    return true;
  }

  /** Writes {@code key} immediately followed by {@code value} and a newline. */
  private void writeHeaderLine(BytesRef key, String value) throws IOException {
    SimpleTextUtil.write(data, key);
    SimpleTextUtil.write(data, value, scratch);
    SimpleTextUtil.writeNewline(data);
  }

  /** Writes {@code value} followed by a newline. */
  private void writeTextLine(String value) throws IOException {
    SimpleTextUtil.write(data, value, scratch);
    SimpleTextUtil.writeNewline(data);
  }

  /** Returns a string made of {@code width} {@code '0'} characters. */
  private static String zeroPattern(int width) {
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < width; i++) {
      sb.append('0');
    }
    return sb.toString();
  }

  /** Creates a locale-independent formatter for the given {@link DecimalFormat} pattern. */
  private static DecimalFormat fixedWidthEncoder(String pattern) {
    return new DecimalFormat(pattern, new DecimalFormatSymbols(Locale.ROOT));
  }

  /**
   * Writes a numeric field: min/max, doc count, origin and digit pattern, followed by two lines per
   * document (the zero-padded delta from the origin, then {@code T} or {@code F} depending on
   * whether the document has a value).
   *
   * @param field the field being written
   * @param valuesProducer source of the values; {@code getNumeric} is called twice, once per pass
   * @throws IOException if reading the values or writing the output fails
   */
  @Override
  public void addNumericField(FieldInfo field, DocValuesProducer valuesProducer)
      throws IOException {
    assert fieldSeen(field.name);
    assert field.getDocValuesType() == DocValuesType.NUMERIC || field.hasNorms();
    writeFieldEntry(field, DocValuesType.NUMERIC);

    // first pass to find min/max
    long minValue = Long.MAX_VALUE;
    long maxValue = Long.MIN_VALUE;
    NumericDocValues values = valuesProducer.getNumeric(field);
    int numValues = 0;
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
      long v = values.longValue();
      minValue = Math.min(minValue, v);
      maxValue = Math.max(maxValue, v);
      numValues++;
    }

    // write absolute min and max for skipper
    writeHeaderLine(MINVALUE, Long.toString(minValue));
    writeHeaderLine(MAXVALUE, Long.toString(maxValue));
    writeHeaderLine(DOCCOUNT, Integer.toString(numValues));

    // documents without a value are encoded as 0 below, so 0 must be inside [minValue, maxValue]
    if (numValues != numDocs) {
      minValue = Math.min(minValue, 0);
      maxValue = Math.max(maxValue, 0);
    }

    // write our minimum value to the .dat, all entries are deltas from that
    writeHeaderLine(ORIGIN, Long.toString(minValue));

    // build up our fixed-width "simple text packed ints" format; BigInteger is used because
    // maxValue - minValue can overflow a long
    final BigInteger origin = BigInteger.valueOf(minValue);
    final BigInteger diffBig = BigInteger.valueOf(maxValue).subtract(origin);
    final String patternString = zeroPattern(diffBig.toString().length());

    // write our pattern to the .dat
    writeHeaderLine(PATTERN, patternString);

    final DecimalFormat encoder = fixedWidthEncoder(patternString);

    int numDocsWritten = 0;

    // second pass to write the values
    values = valuesProducer.getNumeric(field);
    for (int i = 0; i < numDocs; ++i) {
      if (values.docID() < i) {
        values.nextDoc();
        assert values.docID() >= i;
      }
      final boolean hasValue = values.docID() == i;
      long value = hasValue ? values.longValue() : 0;
      assert value >= minValue;
      Number delta = BigInteger.valueOf(value).subtract(origin);
      String s = encoder.format(delta);
      assert s.length() == patternString.length();
      writeTextLine(s);
      writeTextLine(hasValue ? "T" : "F");
      numDocsWritten++;
      assert numDocsWritten <= numDocs;
    }

    assert numDocs == numDocsWritten : "numDocs=" + numDocs + " numDocsWritten=" + numDocsWritten;
  }

  /**
   * Writes a binary field: the field header followed by the records produced by {@link
   * #doAddBinaryField}.
   *
   * @param field the field being written
   * @param valuesProducer source of the binary values
   * @throws IOException if reading the values or writing the output fails
   */
  @Override
  public void addBinaryField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    assert fieldSeen(field.name);
    assert field.getDocValuesType() == DocValuesType.BINARY;
    writeFieldEntry(field, DocValuesType.BINARY);
    doAddBinaryField(field, valuesProducer);
  }

  /**
   * Writes doc count, max length and length pattern, followed by three lines per document: the
   * zero-padded length, the value padded with spaces up to the maximum length, and {@code T} or
   * {@code F} depending on whether the document has a value.
   *
   * <p>The text written for a value is the result of {@code BytesRef.toString()}, and all lengths
   * refer to that string rather than to the raw byte count.
   *
   * @param field the field being written (its header must already have been written)
   * @param valuesProducer source of the values; {@code getBinary} is called twice, once per pass
   * @throws IOException if reading the values or writing the output fails
   */
  private void doAddBinaryField(FieldInfo field, DocValuesProducer valuesProducer)
      throws IOException {
    // first pass: count documents and find the longest textual value
    int maxLength = 0;
    BinaryDocValues values = valuesProducer.getBinary(field);
    int docCount = 0;
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
      ++docCount;
      maxLength = Math.max(maxLength, values.binaryValue().toString().length());
    }

    writeHeaderLine(DOCCOUNT, Integer.toString(docCount));

    // write maxLength
    writeHeaderLine(MAXLENGTH, Integer.toString(maxLength));

    // write our pattern for encoding lengths
    final String lengthPattern = zeroPattern(Integer.toString(maxLength).length());
    writeHeaderLine(PATTERN, lengthPattern);
    final DecimalFormat encoder = fixedWidthEncoder(lengthPattern);

    // second pass: one record per document of the segment
    values = valuesProducer.getBinary(field);
    int numDocsWritten = 0;
    for (int i = 0; i < numDocs; ++i) {
      if (values.docID() < i) {
        values.nextDoc();
        assert values.docID() >= i;
      }
      String stringVal = values.docID() == i ? values.binaryValue().toString() : null;
      // write length
      final int length = stringVal == null ? 0 : stringVal.length();
      writeHeaderLine(LENGTH, encoder.format(length));

      // write bytes as hex array
      if (stringVal != null) {
        SimpleTextUtil.write(data, stringVal, scratch);
      }

      // pad to fit
      for (int j = length; j < maxLength; j++) {
        data.writeByte((byte) ' ');
      }
      SimpleTextUtil.writeNewline(data);
      writeTextLine(stringVal == null ? "F" : "T");
      numDocsWritten++;
    }

    assert numDocs == numDocsWritten;
  }

  /**
   * Writes every term of {@code terms} as two lines: the zero-padded byte length, then the raw
   * term bytes padded with spaces up to {@code maxLength}.
   *
   * @param terms enumeration positioned before the first term
   * @param lengthEncoder formatter used for the length line
   * @param maxLength width every term line is padded to
   * @param expectedCount number of terms the caller counted beforehand; only used by assertions
   * @throws IOException if reading the terms or writing the output fails
   */
  private void writeTermDictionary(
      TermsEnum terms, DecimalFormat lengthEncoder, int maxLength, long expectedCount)
      throws IOException {
    // for asserts:
    long valuesSeen = 0;

    for (BytesRef value = terms.next(); value != null; value = terms.next()) {
      // write length
      writeHeaderLine(LENGTH, lengthEncoder.format(value.length));

      // write bytes -- don't use SimpleText.write
      // because it escapes:
      data.writeBytes(value.bytes, value.offset, value.length);

      // pad to fit
      for (int i = value.length; i < maxLength; i++) {
        data.writeByte((byte) ' ');
      }
      SimpleTextUtil.writeNewline(data);
      valuesSeen++;
      assert valuesSeen <= expectedCount;
    }

    assert valuesSeen == expectedCount;
  }

  /**
   * Writes a sorted field: doc count, number of terms, max term length, length pattern and ord
   * pattern, then the term dictionary, then one line per document holding {@code ord + 1}
   * zero-padded ({@code 0} for documents without a value).
   *
   * @param field the field being written
   * @param valuesProducer source of the values; {@code getSorted} is called once per pass
   * @throws IOException if reading the values or writing the output fails
   */
  @Override
  public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) throws IOException {
    assert fieldSeen(field.name);
    assert field.getDocValuesType() == DocValuesType.SORTED;
    writeFieldEntry(field, DocValuesType.SORTED);

    int docCount = 0;
    SortedDocValues values = valuesProducer.getSorted(field);
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
      ++docCount;
    }
    writeHeaderLine(DOCCOUNT, Integer.toString(docCount));

    int valueCount = 0;
    // starts at -1, so a field without any term records a max length of -1
    int maxLength = -1;
    TermsEnum terms = valuesProducer.getSorted(field).termsEnum();
    for (BytesRef value = terms.next(); value != null; value = terms.next()) {
      maxLength = Math.max(maxLength, value.length);
      valueCount++;
    }

    // write numValues
    writeHeaderLine(NUMVALUES, Integer.toString(valueCount));

    // write maxLength
    writeHeaderLine(MAXLENGTH, Integer.toString(maxLength));

    // write our pattern for encoding lengths
    final String lengthPattern = zeroPattern(Integer.toString(maxLength).length());
    writeHeaderLine(PATTERN, lengthPattern);
    final DecimalFormat encoder = fixedWidthEncoder(lengthPattern);

    // write our pattern for ords; ords are stored shifted by one, hence valueCount + 1
    final String ordPattern = zeroPattern(Long.toString(valueCount + 1L).length());
    writeHeaderLine(ORDPATTERN, ordPattern);
    final DecimalFormat ordEncoder = fixedWidthEncoder(ordPattern);

    writeTermDictionary(valuesProducer.getSorted(field).termsEnum(), encoder, maxLength, valueCount);

    values = valuesProducer.getSorted(field);
    for (int i = 0; i < numDocs; ++i) {
      if (values.docID() < i) {
        values.nextDoc();
        assert values.docID() >= i;
      }
      int ord = -1;
      if (values.docID() == i) {
        ord = values.ordValue();
      }
      writeTextLine(ordEncoder.format(ord + 1L));
    }
  }

  /**
   * Writes a sorted-numeric field: the absolute min/max of all values, then the per-document values
   * as a binary field whose value is the document's numbers joined with commas.
   *
   * @param field the field being written
   * @param valuesProducer source of the values; {@code getSortedNumeric} is called once for the
   *     min/max pass and once per {@code getBinary} call made by {@link #doAddBinaryField}
   * @throws IOException if reading the values or writing the output fails
   */
  @Override
  public void addSortedNumericField(FieldInfo field, final DocValuesProducer valuesProducer)
      throws IOException {
    assert fieldSeen(field.name);
    assert field.getDocValuesType() == DocValuesType.SORTED_NUMERIC;
    writeFieldEntry(field, DocValuesType.SORTED_NUMERIC);

    long minValue = Long.MAX_VALUE;
    long maxValue = Long.MIN_VALUE;
    SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
      for (int i = 0; i < values.docValueCount(); ++i) {
        long v = values.nextValue();
        minValue = Math.min(minValue, v);
        maxValue = Math.max(maxValue, v);
      }
    }

    // write absolute min and max for skipper
    writeHeaderLine(MINVALUE, Long.toString(minValue));
    writeHeaderLine(MAXVALUE, Long.toString(maxValue));

    doAddBinaryField(
        field,
        new EmptyDocValuesProducer() {
          @Override
          public BinaryDocValues getBinary(FieldInfo field) throws IOException {
            SortedNumericDocValues values = valuesProducer.getSortedNumeric(field);
            return new BinaryDocValues() {

              // text of the current document's values, rebuilt on every successful move
              final StringBuilder builder = new StringBuilder();
              BytesRef binaryValue;

              @Override
              public int nextDoc() throws IOException {
                int doc = values.nextDoc();
                setCurrentDoc();
                return doc;
              }

              @Override
              public int docID() {
                return values.docID();
              }

              @Override
              public long cost() {
                return values.cost();
              }

              @Override
              public int advance(int target) throws IOException {
                int doc = values.advance(target);
                setCurrentDoc();
                return doc;
              }

              @Override
              public boolean advanceExact(int target) throws IOException {
                if (values.advanceExact(target)) {
                  setCurrentDoc();
                  return true;
                }
                return false;
              }

              /** Joins the current document's values with commas into {@code binaryValue}. */
              private void setCurrentDoc() throws IOException {
                if (docID() == NO_MORE_DOCS) {
                  return;
                }
                builder.setLength(0);
                for (int i = 0, count = values.docValueCount(); i < count; ++i) {
                  if (i > 0) {
                    builder.append(',');
                  }
                  builder.append(Long.toString(values.nextValue()));
                }
                binaryValue = new BytesRef(builder.toString());
              }

              @Override
              public BytesRef binaryValue() throws IOException {
                return binaryValue;
              }
            };
          }
        });
  }

  /**
   * Appends the ords of the document {@code values} is positioned on to {@code out}, separated by
   * commas. A separator is emitted whenever {@code out} is non-empty, so callers reset {@code out}
   * before each document.
   */
  private static void appendOrdList(SortedSetDocValues values, StringBuilder out)
      throws IOException {
    for (int i = 0; i < values.docValueCount(); i++) {
      if (out.length() > 0) {
        out.append(",");
      }
      out.append(Long.toString(values.nextOrd()));
    }
  }

  /**
   * Writes a sorted-set field: doc count, number of terms, max term length, length pattern and ord
   * list pattern, then the term dictionary, then one line per document holding the document's ords
   * comma-separated and padded with spaces to the longest ord list.
   *
   * @param field the field being written
   * @param valuesProducer source of the values; {@code getSortedSet} is called once per pass
   * @throws IOException if reading the values or writing the output fails
   */
  @Override
  public void addSortedSetField(FieldInfo field, DocValuesProducer valuesProducer)
      throws IOException {
    assert fieldSeen(field.name);
    assert field.getDocValuesType() == DocValuesType.SORTED_SET;
    writeFieldEntry(field, DocValuesType.SORTED_SET);

    int docCount = 0;
    SortedSetDocValues values = valuesProducer.getSortedSet(field);
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
      ++docCount;
    }
    writeHeaderLine(DOCCOUNT, Integer.toString(docCount));

    long valueCount = 0;
    int maxLength = 0;
    TermsEnum terms = valuesProducer.getSortedSet(field).termsEnum();
    for (BytesRef value = terms.next(); value != null; value = terms.next()) {
      maxLength = Math.max(maxLength, value.length);
      valueCount++;
    }

    // write numValues
    writeHeaderLine(NUMVALUES, Long.toString(valueCount));

    // write maxLength
    writeHeaderLine(MAXLENGTH, Integer.toString(maxLength));

    // write our pattern for encoding lengths
    final String lengthPattern = zeroPattern(Integer.toString(maxLength).length());
    writeHeaderLine(PATTERN, lengthPattern);
    final DecimalFormat encoder = fixedWidthEncoder(lengthPattern);

    // compute ord pattern: this is funny, we encode all values for all docs to find the maximum
    // length
    int maxOrdListLength = 0;
    StringBuilder sb2 = new StringBuilder();
    values = valuesProducer.getSortedSet(field);
    for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
      sb2.setLength(0);
      appendOrdList(values, sb2);
      maxOrdListLength = Math.max(maxOrdListLength, sb2.length());
    }

    sb2.setLength(0);
    for (int i = 0; i < maxOrdListLength; i++) {
      sb2.append('X');
    }

    // write our pattern for ord lists
    writeHeaderLine(ORDPATTERN, sb2.toString());

    writeTermDictionary(
        valuesProducer.getSortedSet(field).termsEnum(), encoder, maxLength, valueCount);

    values = valuesProducer.getSortedSet(field);

    // write the ords for each doc comma-separated
    for (int i = 0; i < numDocs; ++i) {
      if (values.docID() < i) {
        values.nextDoc();
        assert values.docID() >= i;
      }
      sb2.setLength(0);
      if (values.docID() == i) {
        appendOrdList(values, sb2);
      }
      // now pad to fit: these are numbers so spaces work well. reader calls trim()
      int numPadding = maxOrdListLength - sb2.length();
      for (int j = 0; j < numPadding; j++) {
        sb2.append(' ');
      }
      writeTextLine(sb2.toString());
    }
  }

  /** write the header for this field */
  private void writeFieldEntry(FieldInfo field, DocValuesType type) throws IOException {
    writeHeaderLine(FIELD, field.name);
    writeHeaderLine(TYPE, type.toString());
  }

  /**
   * Writes the trailing {@code END} line and the checksum, then closes the output. Calling this
   * again after the output has been released is a no-op.
   *
   * @throws IOException if writing the trailer or closing the output fails
   */
  @Override
  public void close() throws IOException {
    if (data != null) {
      boolean success = false;
      try {
        assert !fieldsSeen.isEmpty();
        // TODO: sheisty to do this here?
        SimpleTextUtil.write(data, END);
        SimpleTextUtil.writeNewline(data);
        SimpleTextUtil.writeChecksum(data, scratch);
        success = true;
      } finally {
        if (success) {
          IOUtils.close(data);
        } else {
          // the trailer failed: close quietly so the original exception is the one that propagates
          IOUtils.closeWhileHandlingException(data);
        }
        data = null;
      }
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy