// edu.isi.nlp.io.OffsetIndexedCorpus Maven / Gradle / Ivy
package edu.isi.nlp.io;
import static com.google.common.base.Preconditions.checkNotNull;
import com.google.common.base.Charsets;
import com.google.common.base.Optional;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.io.CharSource;
import com.google.common.io.Files;
import edu.isi.nlp.files.FileUtils;
import edu.isi.nlp.files.KeyValueSource;
import edu.isi.nlp.symbols.Symbol;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.ExecutionException;
/**
 * Represents a corpus of concatenated text files which have been indexed via {@link
 * IndexFlatGigaword} or some similar procedure. Typically this is fine for backing user interfaces
 * (e.g. when it is too slow to copy all 12M files of Gigaword to the DMZ) but too slow for bulk
 * use.
 *
 * <p>Each document ID is resolved to two files: the corpus chunk containing the document's text and
 * the offset index describing where in that chunk the document lives. Loaded offset indices are
 * kept in a small cache keyed by index file so that consecutive lookups against the same chunk do
 * not re-read the index.
 *
 * <p>This should be merged into the newer {@link KeyValueSource} code.
 *
 * @author Ryan Gabbard
 */
public final class OffsetIndexedCorpus implements OriginalTextSource {

  /** Maximum number of offset indices retained in the cache at once. */
  private static final long MAX_CACHED_INDICES = 3;

  private final DocIDToFileMapping corpusTextMapping;
  private final DocIDToFileMapping corpusIndexMapping;
  private final LoadingCache<File, OffsetIndex> offsetIndexCache;

  private OffsetIndexedCorpus(
      final DocIDToFileMapping corpusTextMapping,
      final DocIDToFileMapping corpusIndexMapping,
      final LoadingCache<File, OffsetIndex> offsetIndexCache) {
    this.corpusTextMapping = checkNotNull(corpusTextMapping);
    this.corpusIndexMapping = checkNotNull(corpusIndexMapping);
    this.offsetIndexCache = checkNotNull(offsetIndexCache);
  }

  /**
   * Creates a text source backed by corpus chunk files and their offset index files.
   *
   * @param corpusTextMapping maps a document ID to the corpus chunk file holding its text; must
   *     not be null
   * @param corpusIndexMapping maps a document ID to the offset index file for that chunk; must not
   *     be null
   * @return a text source which looks documents up through the two mappings
   * @throws NullPointerException if either mapping is null
   */
  public static OriginalTextSource fromTextAndOffsetFiles(
      DocIDToFileMapping corpusTextMapping, final DocIDToFileMapping corpusIndexMapping) {
    final LoadingCache<File, OffsetIndex> indexCache =
        CacheBuilder.newBuilder()
            .maximumSize(MAX_CACHED_INDICES)
            .build(
                new CacheLoader<File, OffsetIndex>() {
                  @Override
                  public OffsetIndex load(final File f) throws Exception {
                    // Index files are read through FileUtils.asCompressedByteSource.
                    return OffsetIndices.readBinary(FileUtils.asCompressedByteSource(f));
                  }
                });
    return new OffsetIndexedCorpus(corpusTextMapping, corpusIndexMapping, indexCache);
  }

  /**
   * Looks up the original text of a document, decoded as UTF-8.
   *
   * @param docID the ID of the document to fetch
   * @return the document text, or {@link Optional#absent()} if no corpus chunk is mapped for {@code
   *     docID} or the chunk's index has no entry for it
   * @throws IOException if a corpus chunk is mapped for {@code docID} but no index file is, if the
   *     offset index cannot be loaded, or if reading the text fails
   */
  @Override
  public Optional<String> getOriginalText(final Symbol docID) throws IOException {
    final Optional<File> file = corpusTextMapping.fileForDocID(docID);
    if (!file.isPresent()) {
      return Optional.absent();
    }

    final Optional<File> indexFile = corpusIndexMapping.fileForDocID(docID);
    if (!indexFile.isPresent()) {
      // Report the file itself rather than the Optional wrapper around it.
      throw new IOException("No index found for corpus chunk " + file.get());
    }

    try {
      final IndexedByteSource source =
          IndexedByteSource.from(
              Files.asByteSource(file.get()), offsetIndexCache.get(indexFile.get()));
      final Optional<CharSource> channelCharSource =
          source.channelAsCharSource(docID, Charsets.UTF_8);
      if (!channelCharSource.isPresent()) {
        return Optional.absent();
      }
      return Optional.of(channelCharSource.get().read());
    } catch (ExecutionException e) {
      // The cache wraps loader failures in ExecutionException; surface them as IOException to
      // honor this method's contract while preserving the cause.
      throw new IOException(
          "Failed to load offset index " + indexFile.get() + " for document " + docID, e);
    }
  }
}