org.apache.lucene.codecs.simpletext.SimpleTextDocValuesFormat Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-codecs Show documentation
Apache Lucene (module: codecs)
There is a newer version: 9.11.1
package org.apache.lucene.codecs.simpletext;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;

/**
 * plain text doc values format.
 * 
 * FOR RECREATIONAL USE ONLY
 * 

 * the .dat file contains the data.
 *  for numbers this is a "fixed-width" file, for example a single byte range:
 *  
 *  field myField
 *    type NUMERIC
 *    minvalue 0
 *    pattern 000
 *  005
 *  T
 *  234
 *  T
 *  123
 *  T
 *  ...
 *  
 *  so a document's value (delta encoded from minvalue) can be retrieved by 
 *  seeking to startOffset + (1+pattern.length()+2)*docid. The extra 1 is the newline. 
 *  The extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
 *  
 *  for bytes this is also a "fixed-width" file, for example:
 *  field myField
 *    type BINARY
 *    maxlength 6
 *    pattern 0
 *  length 6
 *  foobar[space][space]
 *  T
 *  length 3
 *  baz[space][space][space][space][space]
 *  T
 *  ...
 *  
 *  so a doc's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength+2)*doc
 *  the extra 9 is 2 newlines, plus "length " itself.
 *  the extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
 *  
 *  for sorted bytes this is a fixed-width file, for example:
 *  field myField
 *    type SORTED
 *    numvalues 10
 *    maxLength 8
 *    pattern 0
 *    ordpattern 00
 *  length 6
 *  foobar[space][space]
 *  length 3
 *  baz[space][space][space][space][space]
 *  ...
 *  03
 *  06
 *  01
 *  10
 *  ...
 *  
 *  so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
 *  a document's ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
 *  an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
 *  
 *  for sorted set this is a fixed-width file very similar to the SORTED case, for example:
 *  field myField
 *    type SORTED_SET
 *    numvalues 10
 *    maxLength 8
 *    pattern 0
 *    ordpattern XXXXX
 *  length 6
 *  foobar[space][space]
 *  length 3
 *  baz[space][space][space][space][space]
 *  ...
 *  0,3,5   
 *  1,2
 *  
 *  10
 *  ...
 *  
 *  so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
 *  a document's ord list can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
 *  this is a comma-separated list, and it is padded with spaces to be fixed width. so trim() and split() it.
 *  and beware the empty string!
 *  an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
 *   
 *  the reader can just scan this file when it opens, skipping over the data blocks
 *  and saving the offset/etc for each field. 
 *  @lucene.experimental
 */
public class SimpleTextDocValuesFormat extends DocValuesFormat {

  /** Name handed to the {@link DocValuesFormat} superclass constructor. */
  private static final String FORMAT_NAME = "SimpleText";

  /** Extension argument shared by the writer and the reader (the {@code .dat} data file). */
  private static final String DATA_EXTENSION = "dat";

  /** Creates the format, passing {@code "SimpleText"} as its name to the superclass. */
  public SimpleTextDocValuesFormat() {
    super(FORMAT_NAME);
  }

  /**
   * Builds the plain-text doc values writer for a segment.
   *
   * @param state write state forwarded unchanged to the writer
   * @return a new {@code SimpleTextDocValuesWriter} constructed with the {@code "dat"} extension
   * @throws IOException if constructing the writer fails
   */
  @Override
  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    final DocValuesConsumer writer = new SimpleTextDocValuesWriter(state, DATA_EXTENSION);
    return writer;
  }

  /**
   * Builds the plain-text doc values reader for a segment.
   *
   * @param state read state forwarded unchanged to the reader
   * @return a new {@code SimpleTextDocValuesReader} constructed with the {@code "dat"} extension
   * @throws IOException if constructing the reader fails
   */
  @Override
  public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
    final DocValuesProducer reader = new SimpleTextDocValuesReader(state, DATA_EXTENSION);
    return reader;
  }
}