All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.codecs.simpletext.SimpleTextDocValuesReader Maven / Gradle / Ivy

There is a newer version: 9.11.1
Show newest version
package org.apache.lucene.codecs.simpletext;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.END;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.FIELD;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.LENGTH;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.MAXLENGTH;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.MINVALUE;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.NUMVALUES;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.ORDPATTERN;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.PATTERN;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.TYPE;

import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfo.DocValuesType;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.StringHelper;

class SimpleTextDocValuesReader extends DocValuesProducer {

  static class OneField {
    long dataStartFilePointer;
    String pattern;
    String ordPattern;
    int maxLength;
    boolean fixedLength;
    long minValue;
    long numValues;
  }

  final int maxDoc;
  final IndexInput data;
  final BytesRef scratch = new BytesRef();
  final Map fields = new HashMap();
  
  public SimpleTextDocValuesReader(SegmentReadState state, String ext) throws IOException {
    // System.out.println("dir=" + state.directory + " seg=" + state.segmentInfo.name + " file=" + IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, ext));
    data = state.directory.openInput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, ext), state.context);
    maxDoc = state.segmentInfo.getDocCount();
    while(true) {
      readLine();
      //System.out.println("READ field=" + scratch.utf8ToString());
      if (scratch.equals(END)) {
        break;
      }
      assert startsWith(FIELD) : scratch.utf8ToString();
      String fieldName = stripPrefix(FIELD);
      //System.out.println("  field=" + fieldName);

      OneField field = new OneField();
      fields.put(fieldName, field);

      readLine();
      assert startsWith(TYPE) : scratch.utf8ToString();

      DocValuesType dvType = DocValuesType.valueOf(stripPrefix(TYPE));
      assert dvType != null;
      if (dvType == DocValuesType.NUMERIC) {
        readLine();
        assert startsWith(MINVALUE): "got " + scratch.utf8ToString() + " field=" + fieldName + " ext=" + ext;
        field.minValue = Long.parseLong(stripPrefix(MINVALUE));
        readLine();
        assert startsWith(PATTERN);
        field.pattern = stripPrefix(PATTERN);
        field.dataStartFilePointer = data.getFilePointer();
        data.seek(data.getFilePointer() + (1+field.pattern.length()+2) * maxDoc);
      } else if (dvType == DocValuesType.BINARY) {
        readLine();
        assert startsWith(MAXLENGTH);
        field.maxLength = Integer.parseInt(stripPrefix(MAXLENGTH));
        readLine();
        assert startsWith(PATTERN);
        field.pattern = stripPrefix(PATTERN);
        field.dataStartFilePointer = data.getFilePointer();
        data.seek(data.getFilePointer() + (9+field.pattern.length()+field.maxLength+2) * maxDoc);
      } else if (dvType == DocValuesType.SORTED || dvType == DocValuesType.SORTED_SET) {
        readLine();
        assert startsWith(NUMVALUES);
        field.numValues = Long.parseLong(stripPrefix(NUMVALUES));
        readLine();
        assert startsWith(MAXLENGTH);
        field.maxLength = Integer.parseInt(stripPrefix(MAXLENGTH));
        readLine();
        assert startsWith(PATTERN);
        field.pattern = stripPrefix(PATTERN);
        readLine();
        assert startsWith(ORDPATTERN);
        field.ordPattern = stripPrefix(ORDPATTERN);
        field.dataStartFilePointer = data.getFilePointer();
        data.seek(data.getFilePointer() + (9+field.pattern.length()+field.maxLength) * field.numValues + (1+field.ordPattern.length())*maxDoc);
      } else {
        throw new AssertionError();
      }
    }

    // We should only be called from above if at least one
    // field has DVs:
    assert !fields.isEmpty();
  }

  @Override
  public NumericDocValues getNumeric(FieldInfo fieldInfo) throws IOException {
    final OneField field = fields.get(fieldInfo.name);
    assert field != null;

    // SegmentCoreReaders already verifies this field is
    // valid:
    assert field != null: "field=" + fieldInfo.name + " fields=" + fields;

    final IndexInput in = data.clone();
    final BytesRef scratch = new BytesRef();
    final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));

    decoder.setParseBigDecimal(true);

    return new NumericDocValues() {
      @Override
      public long get(int docID) {
        try {
          //System.out.println(Thread.currentThread().getName() + ": get docID=" + docID + " in=" + in);
          if (docID < 0 || docID >= maxDoc) {
            throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID);
          }
          in.seek(field.dataStartFilePointer + (1+field.pattern.length()+2)*docID);
          SimpleTextUtil.readLine(in, scratch);
          //System.out.println("parsing delta: " + scratch.utf8ToString());
          BigDecimal bd;
          try {
            bd = (BigDecimal) decoder.parse(scratch.utf8ToString());
          } catch (ParseException pe) {
            CorruptIndexException e = new CorruptIndexException("failed to parse BigDecimal value (resource=" + in + ")");
            e.initCause(pe);
            throw e;
          }
          SimpleTextUtil.readLine(in, scratch); // read the line telling us if its real or not
          return BigInteger.valueOf(field.minValue).add(bd.toBigIntegerExact()).longValue();
        } catch (IOException ioe) {
          throw new RuntimeException(ioe);
        }
      }
    };
  }
  
  private Bits getNumericDocsWithField(FieldInfo fieldInfo) throws IOException {
    final OneField field = fields.get(fieldInfo.name);
    final IndexInput in = data.clone();
    final BytesRef scratch = new BytesRef();
    return new Bits() {
      @Override
      public boolean get(int index) {
        try {
          in.seek(field.dataStartFilePointer + (1+field.pattern.length()+2)*index);
          SimpleTextUtil.readLine(in, scratch); // data
          SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
          return scratch.bytes[scratch.offset] == (byte) 'T';
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }

      @Override
      public int length() {
        return maxDoc;
      }
    };
  }

  @Override
  public BinaryDocValues getBinary(FieldInfo fieldInfo) throws IOException {
    final OneField field = fields.get(fieldInfo.name);

    // SegmentCoreReaders already verifies this field is
    // valid:
    assert field != null;

    final IndexInput in = data.clone();
    final BytesRef scratch = new BytesRef();
    final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));

    return new BinaryDocValues() {
      @Override
      public void get(int docID, BytesRef result) {
        try {
          if (docID < 0 || docID >= maxDoc) {
            throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID);
          }
          in.seek(field.dataStartFilePointer + (9+field.pattern.length() + field.maxLength+2)*docID);
          SimpleTextUtil.readLine(in, scratch);
          assert StringHelper.startsWith(scratch, LENGTH);
          int len;
          try {
            len = decoder.parse(new String(scratch.bytes, scratch.offset + LENGTH.length, scratch.length - LENGTH.length, "UTF-8")).intValue();
          } catch (ParseException pe) {
            CorruptIndexException e = new CorruptIndexException("failed to parse int length (resource=" + in + ")");
            e.initCause(pe);
            throw e;
          }
          result.bytes = new byte[len];
          result.offset = 0;
          result.length = len;
          in.readBytes(result.bytes, 0, len);
        } catch (IOException ioe) {
          throw new RuntimeException(ioe);
        }
      }
    };
  }
  
  private Bits getBinaryDocsWithField(FieldInfo fieldInfo) throws IOException {
    final OneField field = fields.get(fieldInfo.name);
    final IndexInput in = data.clone();
    final BytesRef scratch = new BytesRef();
    final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));

    return new Bits() {
      @Override
      public boolean get(int index) {
        try {
          in.seek(field.dataStartFilePointer + (9+field.pattern.length() + field.maxLength+2)*index);
          SimpleTextUtil.readLine(in, scratch);
          assert StringHelper.startsWith(scratch, LENGTH);
          int len;
          try {
            len = decoder.parse(new String(scratch.bytes, scratch.offset + LENGTH.length, scratch.length - LENGTH.length, "UTF-8")).intValue();
          } catch (ParseException pe) {
            CorruptIndexException e = new CorruptIndexException("failed to parse int length (resource=" + in + ")");
            e.initCause(pe);
            throw e;
          }
          // skip past bytes
          byte bytes[] = new byte[len];
          in.readBytes(bytes, 0, len);
          SimpleTextUtil.readLine(in, scratch); // newline
          SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
          return scratch.bytes[scratch.offset] == (byte) 'T';
        } catch (IOException ioe) {
          throw new RuntimeException(ioe);
        }
      }

      @Override
      public int length() {
        return maxDoc;
      }
    };
  }

  @Override
  public SortedDocValues getSorted(FieldInfo fieldInfo) throws IOException {
    final OneField field = fields.get(fieldInfo.name);

    // SegmentCoreReaders already verifies this field is
    // valid:
    assert field != null;

    final IndexInput in = data.clone();
    final BytesRef scratch = new BytesRef();
    final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
    final DecimalFormat ordDecoder = new DecimalFormat(field.ordPattern, new DecimalFormatSymbols(Locale.ROOT));

    return new SortedDocValues() {
      @Override
      public int getOrd(int docID) {
        if (docID < 0 || docID >= maxDoc) {
          throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID);
        }
        try {
          in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + docID * (1 + field.ordPattern.length()));
          SimpleTextUtil.readLine(in, scratch);
          try {
            return (int) ordDecoder.parse(scratch.utf8ToString()).longValue()-1;
          } catch (ParseException pe) {
            CorruptIndexException e = new CorruptIndexException("failed to parse ord (resource=" + in + ")");
            e.initCause(pe);
            throw e;
          }
        } catch (IOException ioe) {
          throw new RuntimeException(ioe);
        }
      }

      @Override
      public void lookupOrd(int ord, BytesRef result) {
        try {
          if (ord < 0 || ord >= field.numValues) {
            throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues-1) + "; got " + ord);
          }
          in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength));
          SimpleTextUtil.readLine(in, scratch);
          assert StringHelper.startsWith(scratch, LENGTH): "got " + scratch.utf8ToString() + " in=" + in;
          int len;
          try {
            len = decoder.parse(new String(scratch.bytes, scratch.offset + LENGTH.length, scratch.length - LENGTH.length, "UTF-8")).intValue();
          } catch (ParseException pe) {
            CorruptIndexException e = new CorruptIndexException("failed to parse int length (resource=" + in + ")");
            e.initCause(pe);
            throw e;
          }
          result.bytes = new byte[len];
          result.offset = 0;
          result.length = len;
          in.readBytes(result.bytes, 0, len);
        } catch (IOException ioe) {
          throw new RuntimeException(ioe);
        }
      }

      @Override
      public int getValueCount() {
        return (int)field.numValues;
      }
    };
  }

  @Override
  public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
    final OneField field = fields.get(fieldInfo.name);

    // SegmentCoreReaders already verifies this field is
    // valid:
    assert field != null;

    final IndexInput in = data.clone();
    final BytesRef scratch = new BytesRef();
    final DecimalFormat decoder = new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
    
    return new SortedSetDocValues() {
      String[] currentOrds = new String[0];
      int currentIndex = 0;
      
      @Override
      public long nextOrd() {
        if (currentIndex == currentOrds.length) {
          return NO_MORE_ORDS;
        } else {
          return Long.parseLong(currentOrds[currentIndex++]);
        }
      }

      @Override
      public void setDocument(int docID) {
        if (docID < 0 || docID >= maxDoc) {
          throw new IndexOutOfBoundsException("docID must be 0 .. " + (maxDoc-1) + "; got " + docID);
        }
        try {
          in.seek(field.dataStartFilePointer + field.numValues * (9 + field.pattern.length() + field.maxLength) + docID * (1 + field.ordPattern.length()));
          SimpleTextUtil.readLine(in, scratch);
          String ordList = scratch.utf8ToString().trim();
          if (ordList.isEmpty()) {
            currentOrds = new String[0];
          } else {
            currentOrds = ordList.split(",");
          }
          currentIndex = 0;
        } catch (IOException ioe) {
          throw new RuntimeException(ioe);
        }
      }

      @Override
      public void lookupOrd(long ord, BytesRef result) {
        try {
          if (ord < 0 || ord >= field.numValues) {
            throw new IndexOutOfBoundsException("ord must be 0 .. " + (field.numValues-1) + "; got " + ord);
          }
          in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength));
          SimpleTextUtil.readLine(in, scratch);
          assert StringHelper.startsWith(scratch, LENGTH): "got " + scratch.utf8ToString() + " in=" + in;
          int len;
          try {
            len = decoder.parse(new String(scratch.bytes, scratch.offset + LENGTH.length, scratch.length - LENGTH.length, "UTF-8")).intValue();
          } catch (ParseException pe) {
            CorruptIndexException e = new CorruptIndexException("failed to parse int length (resource=" + in + ")");
            e.initCause(pe);
            throw e;
          }
          result.bytes = new byte[len];
          result.offset = 0;
          result.length = len;
          in.readBytes(result.bytes, 0, len);
        } catch (IOException ioe) {
          throw new RuntimeException(ioe);
        }
      }

      @Override
      public long getValueCount() {
        return field.numValues;
      }
    };
  }
  
  @Override
  public Bits getDocsWithField(FieldInfo field) throws IOException {
    switch (field.getDocValuesType()) {
      case SORTED_SET:
        return new SortedSetDocsWithField(getSortedSet(field), maxDoc);
      case SORTED:
        return new SortedDocsWithField(getSorted(field), maxDoc);
      case BINARY:
        return getBinaryDocsWithField(field);
      case NUMERIC:
        return getNumericDocsWithField(field);
      default:
        throw new AssertionError();
    }
  }

  @Override
  public void close() throws IOException {
    data.close();
  }

  /** Used only in ctor: */
  private void readLine() throws IOException {
    SimpleTextUtil.readLine(data, scratch);
    //System.out.println("line: " + scratch.utf8ToString());
  }

  /** Used only in ctor: */
  private boolean startsWith(BytesRef prefix) {
    return StringHelper.startsWith(scratch, prefix);
  }

  /** Used only in ctor: */
  private String stripPrefix(BytesRef prefix) throws IOException {
    return new String(scratch.bytes, scratch.offset + prefix.length, scratch.length - prefix.length, "UTF-8");
  }

  @Override
  public long ramBytesUsed() {
    return 0;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy