All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.codecs.simpletext.SimpleTextDocValuesFormat Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.simpletext;

import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;

/**
 * Plain text doc values format.
 *
 * <p><b>FOR RECREATIONAL USE ONLY</b>
 *
 * <p>The .dat file contains the data. For numbers this is a "fixed-width" file, for example a
 * single byte range:
 *
 * <pre>
 *  field myField
 *    type NUMERIC
 *    minvalue 0
 *    pattern 000
 *  005
 *  T
 *  234
 *  T
 *  123
 *  T
 *  ...
 * </pre>
 *
 * <p>So a document's value (delta encoded from minvalue) can be retrieved by seeking to
 * {@code startOffset + (1+pattern.length()+2)*docid}. The extra 1 is the newline. The extra 2 is
 * another newline and 'T' or 'F': true if the value is real, false if missing.
 *
 * <p>For bytes this is also a "fixed-width" file, for example:
 *
 * <pre>
 *  field myField
 *    type BINARY
 *    maxlength 6
 *    pattern 0
 *  length 6
 *  foobar[space][space]
 *  T
 *  length 3
 *  baz[space][space][space][space][space]
 *  T
 *  ...
 * </pre>
 *
 * <p>So a doc's value can be retrieved by seeking to
 * {@code startOffset + (9+pattern.length+maxlength+2)*doc}. The extra 9 is 2 newlines, plus
 * "length " itself. The extra 2 is another newline and 'T' or 'F': true if the value is real,
 * false if missing.
 *
 * <p>For sorted bytes this is a fixed-width file, for example:
 *
 * <pre>
 *  field myField
 *    type SORTED
 *    numvalues 10
 *    maxLength 8
 *    pattern 0
 *    ordpattern 00
 *  length 6
 *  foobar[space][space]
 *  length 3
 *  baz[space][space][space][space][space]
 *  ...
 *  03
 *  06
 *  01
 *  10
 *  ...
 * </pre>
 *
 * <p>So the "ord section" begins at
 * {@code startOffset + (9+pattern.length+maxlength)*numValues}. A document's ord can be retrieved
 * by seeking to {@code "ord section" + (1+ordpattern.length())*docid}. An ord's value can be
 * retrieved by seeking to {@code startOffset + (9+pattern.length+maxlength)*ord}.
 *
 * <p>For sorted set this is a fixed-width file very similar to the SORTED case, for example:
 *
 * <pre>
 *  field myField
 *    type SORTED_SET
 *    numvalues 10
 *    maxLength 8
 *    pattern 0
 *    ordpattern XXXXX
 *  length 6
 *  foobar[space][space]
 *  length 3
 *  baz[space][space][space][space][space]
 *  ...
 *  0,3,5
 *  1,2
 *
 *  10
 *  ...
 * </pre>
 *
 * <p>So the "ord section" begins at
 * {@code startOffset + (9+pattern.length+maxlength)*numValues}. A document's ord list can be
 * retrieved by seeking to {@code "ord section" + (1+ordpattern.length())*docid}. This is a
 * comma-separated list, and it's padded with spaces to be fixed width, so trim() and split() it,
 * and beware the empty string! An ord's value can be retrieved by seeking to
 * {@code startOffset + (9+pattern.length+maxlength)*ord}.
 *
 * <p>For sorted numerics, it's encoded (not very creatively) as a comma-separated list of strings
 * the same as binary. Beware the empty string!
 *
 * <p>The reader can just scan this file when it opens, skipping over the data blocks and saving
 * the offset/etc for each field.
 *
 * @lucene.experimental
 */
class SimpleTextDocValuesFormat extends DocValuesFormat {

  /** Name passed to the {@link DocValuesFormat} constructor. */
  private static final String FORMAT_NAME = "SimpleText";

  /** Second argument handed to both the writer and the reader (the ".dat" file described above). */
  private static final String DATA_EXTENSION = "dat";

  /** Creates the format, passing the name {@code "SimpleText"} to the superclass constructor. */
  public SimpleTextDocValuesFormat() {
    super(FORMAT_NAME);
  }

  /**
   * Builds the consumer used to write doc values.
   *
   * @param state segment write state, forwarded unchanged to the writer
   * @return a new {@code SimpleTextDocValuesWriter} created from {@code state} and {@code "dat"}
   * @throws IOException propagated from constructing the writer
   */
  @Override
  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    final DocValuesConsumer writer = new SimpleTextDocValuesWriter(state, DATA_EXTENSION);
    return writer;
  }

  /**
   * Builds the producer used to read doc values.
   *
   * @param state segment read state, forwarded unchanged to the reader
   * @return a new {@code SimpleTextDocValuesReader} created from {@code state} and {@code "dat"}
   * @throws IOException propagated from constructing the reader
   */
  @Override
  public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
    final DocValuesProducer reader = new SimpleTextDocValuesReader(state, DATA_EXTENSION);
    return reader;
  }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy