org.apache.lucene.codecs.simpletext.SimpleTextCompoundFormat Maven / Gradle / Ivy
Show all versions of lucene-codecs Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.simpletext;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.ParseException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Locale;
import java.util.Set;
import org.apache.lucene.codecs.CompoundDirectory;
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.ReadAdvice;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.StringHelper;
/**
 * Plain text compound format.
 *
 * <p>The compound file (extension {@code scf}) is laid out as follows:
 *
 * <ol>
 *   <li>for every segment file, in sorted name order: a {@code "cfs entry for: "} header line
 *       followed by the raw bytes of that file;
 *   <li>a table of contents: a {@code "table of contents, size: "} line holding the number of
 *       entries, then a filename / start offset / end offset line triple per entry;
 *   <li>a fixed-width trailer line holding the zero-padded offset at which the table of contents
 *       begins, so that a reader can locate it by seeking backwards from the end of the file.
 * </ol>
 *
 * <p><b>FOR RECREATIONAL USE ONLY</b>
 *
 * @lucene.experimental
 */
public class SimpleTextCompoundFormat extends CompoundFormat {

  /** Sole constructor. */
  public SimpleTextCompoundFormat() {}

  /**
   * Opens the compound file of the given segment and returns a read-only directory view over the
   * sub-files recorded in its table of contents.
   *
   * @param dir directory containing the compound data file
   * @param si segment whose compound file should be opened
   * @return a {@link CompoundDirectory} whose inputs are slices of the single compound file
   * @throws CorruptIndexException if the trailer is missing, unparsable, or points outside the
   *     file
   * @throws IOException if the underlying file cannot be opened or read
   */
  @Override
  public CompoundDirectory getCompoundReader(Directory dir, SegmentInfo si) throws IOException {
    String dataFile = IndexFileNames.segmentFileName(si.name, "", DATA_EXTENSION);
    final IndexInput in =
        dir.openInput(dataFile, IOContext.DEFAULT.withReadAdvice(ReadAdvice.NORMAL));
    try {
      BytesRefBuilder scratch = new BytesRefBuilder();

      // first get to TOC: the trailer has a fixed width (prefix + zero-padded offset + newline),
      // so its start can be computed from the end of the file
      DecimalFormat df =
          new DecimalFormat(OFFSETPATTERN, DecimalFormatSymbols.getInstance(Locale.ROOT));
      long pos = in.length() - TABLEPOS.length - OFFSETPATTERN.length() - 1;
      if (pos < 0) {
        throw new CorruptIndexException(
            "file too short to contain CFS trailer (length=" + in.length() + ")", in);
      }
      in.seek(pos);
      SimpleTextUtil.readLine(in, scratch);
      assert StringHelper.startsWith(scratch.get(), TABLEPOS);
      long tablePos = -1;
      try {
        tablePos = df.parse(stripPrefix(scratch, TABLEPOS)).longValue();
      } catch (ParseException e) {
        throw new CorruptIndexException(
            "can't parse CFS trailer, got: " + scratch.get().utf8ToString(), in, e);
      }
      // the table of contents is written before the trailer, so it must start before it
      if (tablePos < 0 || tablePos >= pos) {
        throw new CorruptIndexException(
            "invalid CFS table of contents offset: " + tablePos + " (trailer at " + pos + ")", in);
      }

      // seek to TOC and read it
      in.seek(tablePos);
      SimpleTextUtil.readLine(in, scratch);
      assert StringHelper.startsWith(scratch.get(), TABLE);
      int numEntries = Integer.parseInt(stripPrefix(scratch, TABLE));

      final String[] fileNames = new String[numEntries];
      final long[] startOffsets = new long[numEntries];
      final long[] endOffsets = new long[numEntries];

      for (int i = 0; i < numEntries; i++) {
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch.get(), TABLENAME);
        fileNames[i] = si.name + IndexFileNames.stripSegmentName(stripPrefix(scratch, TABLENAME));

        if (i > 0) {
          // files must be unique and in sorted order (getIndex relies on binary search)
          assert fileNames[i].compareTo(fileNames[i - 1]) > 0;
        }

        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch.get(), TABLESTART);
        startOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLESTART));

        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch.get(), TABLEEND);
        endOffsets[i] = Long.parseLong(stripPrefix(scratch, TABLEEND));
      }

      return new CompoundDirectory() {

        /** Returns the table index of {@code name}, or throws if there is no such sub-file. */
        private int getIndex(String name) throws IOException {
          int index = Arrays.binarySearch(fileNames, name);
          if (index < 0) {
            throw new FileNotFoundException(
                "No sub-file found (fileName="
                    + name
                    + " files: "
                    + Arrays.toString(fileNames)
                    + ")");
          }
          return index;
        }

        @Override
        public String[] listAll() throws IOException {
          ensureOpen();
          // defensive copy so callers cannot modify the table used for lookups
          return fileNames.clone();
        }

        @Override
        public long fileLength(String name) throws IOException {
          ensureOpen();
          int index = getIndex(name);
          return endOffsets[index] - startOffsets[index];
        }

        @Override
        public IndexInput openInput(String name, IOContext context) throws IOException {
          ensureOpen();
          int index = getIndex(name);
          return in.slice(
              name,
              startOffsets[index],
              endOffsets[index] - startOffsets[index],
              context.readAdvice());
        }

        @Override
        public void close() throws IOException {
          in.close();
        }

        @Override
        public Set<String> getPendingDeletions() throws IOException {
          return Collections.emptySet();
        }

        @Override
        public void checkIntegrity() throws IOException {
          // No checksums for SimpleText
        }
      };
    } catch (Throwable t) {
      // Ownership of 'in' only passes to the returned directory on success; on any failure while
      // reading the trailer or table of contents it must be closed here to avoid leaking it.
      try {
        in.close();
      } catch (Throwable closeFailure) {
        t.addSuppressed(closeFailure);
      }
      throw t;
    }
  }

  /**
   * Writes every file listed in {@code si.files()} into a single compound file, followed by the
   * table of contents and the fixed-width trailer pointing at it.
   *
   * @param dir directory holding the segment files and receiving the compound file
   * @param si segment whose files are packed
   * @param context I/O context used to create the compound output
   * @throws IOException if reading a segment file or writing the compound file fails
   */
  @Override
  public void write(Directory dir, SegmentInfo si, IOContext context) throws IOException {
    String dataFile = IndexFileNames.segmentFileName(si.name, "", DATA_EXTENSION);

    int numFiles = si.files().size();
    String[] names = si.files().toArray(new String[numFiles]);
    // sorted order is required by the reader, which looks names up with a binary search
    Arrays.sort(names);
    long[] startOffsets = new long[numFiles];
    long[] endOffsets = new long[numFiles];

    BytesRefBuilder scratch = new BytesRefBuilder();

    try (IndexOutput out = dir.createOutput(dataFile, context)) {
      for (int i = 0; i < names.length; i++) {
        // write header for file
        SimpleTextUtil.write(out, HEADER);
        SimpleTextUtil.write(out, names[i], scratch);
        SimpleTextUtil.writeNewline(out);

        // write bytes for file
        startOffsets[i] = out.getFilePointer();
        try (IndexInput in = dir.openInput(names[i], IOContext.READONCE)) {
          out.copyBytes(in, in.length());
        }
        endOffsets[i] = out.getFilePointer();
      }

      long tocPos = out.getFilePointer();

      // write CFS table
      SimpleTextUtil.write(out, TABLE);
      SimpleTextUtil.write(out, Integer.toString(numFiles), scratch);
      SimpleTextUtil.writeNewline(out);

      for (int i = 0; i < names.length; i++) {
        SimpleTextUtil.write(out, TABLENAME);
        SimpleTextUtil.write(out, names[i], scratch);
        SimpleTextUtil.writeNewline(out);

        SimpleTextUtil.write(out, TABLESTART);
        SimpleTextUtil.write(out, Long.toString(startOffsets[i]), scratch);
        SimpleTextUtil.writeNewline(out);

        SimpleTextUtil.write(out, TABLEEND);
        SimpleTextUtil.write(out, Long.toString(endOffsets[i]), scratch);
        SimpleTextUtil.writeNewline(out);
      }

      // trailer: zero-padded to a fixed width so the reader can find it from the end of the file
      DecimalFormat df =
          new DecimalFormat(OFFSETPATTERN, DecimalFormatSymbols.getInstance(Locale.ROOT));
      SimpleTextUtil.write(out, TABLEPOS);
      SimpleTextUtil.write(out, df.format(tocPos), scratch);
      SimpleTextUtil.writeNewline(out);
    }
  }

  // helper method to strip away 'prefix' from 'scratch' and return the remainder as a String
  private static String stripPrefix(BytesRefBuilder scratch, BytesRef prefix) {
    return new String(
        scratch.bytes(), prefix.length, scratch.length() - prefix.length, StandardCharsets.UTF_8);
  }

  /** Extension of compound file */
  static final String DATA_EXTENSION = "scf";

  // line prefixes used in the compound file
  static final BytesRef HEADER = new BytesRef("cfs entry for: ");
  static final BytesRef TABLE = new BytesRef("table of contents, size: ");
  static final BytesRef TABLENAME = new BytesRef(" filename: ");
  static final BytesRef TABLESTART = new BytesRef(" start: ");
  static final BytesRef TABLEEND = new BytesRef(" end: ");
  static final BytesRef TABLEPOS = new BytesRef("table of contents begins at offset: ");

  /** DecimalFormat pattern of all zeros, one per decimal digit of {@code Long.MAX_VALUE}. */
  static final String OFFSETPATTERN;

  static {
    int numDigits = Long.toString(Long.MAX_VALUE).length();
    char[] pattern = new char[numDigits];
    Arrays.fill(pattern, '0');
    OFFSETPATTERN = new String(pattern);
  }
}