// org.apache.lucene.codecs.simpletext.SimpleTextTermVectorsWriter Maven / Gradle / Ivy
// Show all versions of lucene-codecs Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import org.apache.lucene.codecs.TermVectorsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
/**
 * Term vectors writer that serializes documents, fields, terms and positions as labeled lines
 * of plain text.
 *
 * <p><b>FOR RECREATIONAL USE ONLY</b>
 *
 * @lucene.experimental
 */
public class SimpleTextTermVectorsWriter extends TermVectorsWriter {

  // Line labels of the text format; each label is written verbatim ahead of its value.
  static final BytesRef END = new BytesRef("END");
  static final BytesRef DOC = new BytesRef("doc ");
  static final BytesRef NUMFIELDS = new BytesRef(" numfields ");
  static final BytesRef FIELD = new BytesRef(" field ");
  static final BytesRef FIELDNAME = new BytesRef(" name ");
  static final BytesRef FIELDPOSITIONS = new BytesRef(" positions ");
  static final BytesRef FIELDOFFSETS = new BytesRef(" offsets ");
  static final BytesRef FIELDPAYLOADS = new BytesRef(" payloads ");
  static final BytesRef FIELDTERMCOUNT = new BytesRef(" numterms ");
  static final BytesRef TERMTEXT = new BytesRef(" term ");
  static final BytesRef TERMFREQ = new BytesRef(" freq ");
  static final BytesRef POSITION = new BytesRef(" position ");
  static final BytesRef PAYLOAD = new BytesRef(" payload ");
  static final BytesRef STARTOFFSET = new BytesRef(" startoffset ");
  static final BytesRef ENDOFFSET = new BytesRef(" endoffset ");

  /** Extension of the file that holds the term vectors. */
  static final String VECTORS_EXTENSION = "tvc";

  /** Destination of all written lines; reset to {@code null} by {@link #close()}. */
  private IndexOutput out;

  /** Number of documents started so far; also the ordinal written for the next document. */
  private int numDocsWritten = 0;

  /** Reusable buffer handed to {@link SimpleTextUtil} when writing strings and the checksum. */
  private final BytesRefBuilder scratch = new BytesRefBuilder();

  // Flags of the field currently being written, captured by startField for addPosition.
  private boolean offsets;
  private boolean positions;
  private boolean payloads;

  /**
   * Creates the term vectors output for a segment.
   *
   * @param directory directory in which the output file is created
   * @param segment segment name used to derive the file name
   * @param context IO context passed through to {@code createOutput}
   * @throws IOException if the output cannot be created
   */
  public SimpleTextTermVectorsWriter(Directory directory, String segment, IOContext context)
      throws IOException {
    boolean opened = false;
    try {
      final String fileName = IndexFileNames.segmentFileName(segment, "", VECTORS_EXTENSION);
      out = directory.createOutput(fileName, context);
      opened = true;
    } finally {
      if (opened == false) {
        // Creation failed: release whatever this writer holds without masking the failure.
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  /**
   * Writes the header of a new document: its ordinal followed by its number of vector fields.
   *
   * @param numVectorFields number of fields with term vectors in this document
   * @throws IOException if writing fails
   */
  @Override
  public void startDocument(int numVectorFields) throws IOException {
    emitLine(DOC, String.valueOf(numDocsWritten));
    emitLine(NUMFIELDS, String.valueOf(numVectorFields));
    // Counted only once both header lines have been written.
    numDocsWritten++;
  }

  /**
   * Writes the header of a field (number, name, the three feature flags and the term count) and
   * remembers the flags for the {@link #addPosition} calls that follow.
   *
   * @param info the field being written; its number and name are emitted
   * @param numTerms number of terms in the field
   * @param positions whether positions are recorded for this field
   * @param offsets whether offsets are recorded for this field
   * @param payloads whether payloads are recorded for this field
   * @throws IOException if writing fails
   */
  @Override
  public void startField(
      FieldInfo info, int numTerms, boolean positions, boolean offsets, boolean payloads)
      throws IOException {
    // Written piecewise so the label reaches the output before info is read.
    emit(FIELD);
    emit(String.valueOf(info.number));
    endLine();

    emitLine(FIELDNAME, info.name);
    emitLine(FIELDPOSITIONS, String.valueOf(positions));
    emitLine(FIELDOFFSETS, String.valueOf(offsets));
    emitLine(FIELDPAYLOADS, String.valueOf(payloads));
    emitLine(FIELDTERMCOUNT, String.valueOf(numTerms));

    // State is updated only after the whole header has been written.
    this.positions = positions;
    this.offsets = offsets;
    this.payloads = payloads;
  }

  /**
   * Writes a term line followed by its frequency line.
   *
   * @param term bytes of the term
   * @param freq frequency of the term
   * @throws IOException if writing fails
   */
  @Override
  public void startTerm(BytesRef term, int freq) throws IOException {
    emit(TERMTEXT);
    emit(term);
    endLine();
    emitLine(TERMFREQ, String.valueOf(freq));
  }

  /**
   * Writes one occurrence of the current term, emitting only the parts enabled for the current
   * field: the position (plus a payload line when payloads are enabled), then the offsets.
   *
   * @param position position of the occurrence
   * @param startOffset start offset of the occurrence
   * @param endOffset end offset of the occurrence
   * @param payload payload of the occurrence, or {@code null} for none
   * @throws IOException if writing fails
   */
  @Override
  public void addPosition(int position, int startOffset, int endOffset, BytesRef payload)
      throws IOException {
    assert positions || offsets;

    if (positions) {
      writePosition(position, payload);
    }
    if (offsets) {
      writeOffsets(startOffset, endOffset);
    }
  }

  /**
   * Verifies that the number of documents written matches the expected count, then writes the
   * end marker and the checksum.
   *
   * @param numDocs number of documents the caller expects to have been written
   * @throws RuntimeException if the counts differ
   * @throws IOException if writing fails
   */
  @Override
  public void finish(int numDocs) throws IOException {
    if (numDocsWritten == numDocs) {
      emit(END);
      endLine();
      SimpleTextUtil.writeChecksum(out, scratch);
      return;
    }

    final String detail =
        "mergeVectors produced an invalid result: mergedDocs is "
            + numDocs
            + " but vec numDocs is "
            + numDocsWritten
            + " file="
            + out.toString()
            + "; now aborting this merge to prevent index corruption";
    throw new RuntimeException(detail);
  }

  /**
   * Closes the output. The reference to it is cleared whether or not closing succeeds.
   *
   * @throws IOException if closing the output fails
   */
  @Override
  public void close() throws IOException {
    final IndexOutput closing = out;
    out = null;
    IOUtils.close(closing);
  }

  /** Returns the length of the byte array currently backing the scratch buffer. */
  @Override
  public long ramBytesUsed() {
    final BytesRef buffered = scratch.get();
    return buffered.bytes.length;
  }

  /** Writes the position line and, when payloads are enabled, the payload line. */
  private void writePosition(int position, BytesRef payload) throws IOException {
    emitLine(POSITION, String.valueOf(position));
    if (payloads == false) {
      return;
    }
    // The payload label is always written; its value is left empty when there is no payload.
    emit(PAYLOAD);
    if (payload != null) {
      assert payload.length > 0;
      emit(payload);
    }
    endLine();
  }

  /** Writes the start offset line followed by the end offset line. */
  private void writeOffsets(int startOffset, int endOffset) throws IOException {
    emitLine(STARTOFFSET, String.valueOf(startOffset));
    emitLine(ENDOFFSET, String.valueOf(endOffset));
  }

  /** Writes a label, then its textual value, then a line terminator. */
  private void emitLine(BytesRef label, String value) throws IOException {
    emit(label);
    emit(value);
    endLine();
  }

  /** Writes a string to the output through the scratch buffer. */
  private void emit(String text) throws IOException {
    SimpleTextUtil.write(out, text, scratch);
  }

  /** Writes raw bytes to the output. */
  private void emit(BytesRef bytes) throws IOException {
    SimpleTextUtil.write(out, bytes);
  }

  /** Terminates the current line of output. */
  private void endLine() throws IOException {
    SimpleTextUtil.writeNewline(out);
  }
}