// org.apache.lucene.codecs.simpletext.SimpleTextDocValuesFormat Maven / Gradle / Ivy
// Show all versions of lucene-codecs Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
/**
* plain text doc values format.
*
* FOR RECREATIONAL USE ONLY
*
*
* the .dat file contains the data. for numbers this is a "fixed-width" file, for example a
* single byte range:
*
*
* field myField
* type NUMERIC
* minvalue 0
* pattern 000
* 005
* T
* 234
* T
* 123
* T
* ...
*
*
* so a document's value (delta encoded from minvalue) can be retrieved by seeking to startOffset +
* (1+pattern.length()+2)*docid. The extra 1 is the newline. The extra 2 is another newline and 'T'
* or 'F': true if the value is real, false if missing.
*
* for bytes this is also a "fixed-width" file, for example:
*
*
* field myField
* type BINARY
* maxlength 8
* pattern 0
* length 6
* foobar[space][space]
* T
* length 3
* baz[space][space][space][space][space]
* T
* ...
*
*
* so a doc's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength+2)*doc.
* the extra 9 is 2 newlines, plus "length " itself. the extra 2 is another newline and 'T' or 'F':
* true if the value is real, false if missing.
*
* for sorted bytes this is a fixed-width file, for example:
*
*
* field myField
* type SORTED
* numvalues 10
* maxLength 8
* pattern 0
* ordpattern 00
* length 6
* foobar[space][space]
* length 3
* baz[space][space][space][space][space]
* ...
* 03
* 06
* 01
* 10
* ...
*
*
* so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues. a document's
* ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid. an ord's value
* can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord.
*
* for sorted set this is a fixed-width file very similar to the SORTED case, for example:
*
*
* field myField
* type SORTED_SET
* numvalues 10
* maxLength 8
* pattern 0
* ordpattern XXXXX
* length 6
* foobar[space][space]
* length 3
* baz[space][space][space][space][space]
* ...
* 0,3,5
* 1,2
*
* 10
* ...
*
*
* so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues. a document's
* ord list can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid. this is a
* comma-separated list, and it's padded with spaces to be fixed width. so trim() and split() it.
* and beware the empty string! an ord's value can be retrieved by seeking to startOffset +
* (9+pattern.length+maxlength)*ord.
*
* for sorted numerics, it's encoded (not very creatively) as a comma-separated list of strings
* the same as binary. beware the empty string!
*
*
* the reader can just scan this file when it opens, skipping over the data blocks and saving the
* offset/etc for each field.
*
* @lucene.experimental
*/
class SimpleTextDocValuesFormat extends DocValuesFormat {

  /** Extension string handed to both the writer and the reader created by this format. */
  private static final String DATA_EXTENSION = "dat";

  /** Creates the format, passing {@code "SimpleText"} to the {@link DocValuesFormat} constructor. */
  public SimpleTextDocValuesFormat() {
    super("SimpleText");
  }

  /**
   * Builds the consumer used to write doc values.
   *
   * @param state write state forwarded unchanged to the new {@link SimpleTextDocValuesWriter}
   * @return a new {@link SimpleTextDocValuesWriter} created with the {@code "dat"} extension
   * @throws IOException if constructing the writer fails
   */
  @Override
  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    final DocValuesConsumer writer = new SimpleTextDocValuesWriter(state, DATA_EXTENSION);
    return writer;
  }

  /**
   * Builds the producer used to read doc values.
   *
   * @param state read state forwarded unchanged to the new {@link SimpleTextDocValuesReader}
   * @return a new {@link SimpleTextDocValuesReader} created with the {@code "dat"} extension
   * @throws IOException if constructing the reader fails
   */
  @Override
  public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
    final DocValuesProducer reader = new SimpleTextDocValuesReader(state, DATA_EXTENSION);
    return reader;
  }
}