All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.isi.nlp.io.OffsetIndexedCorpus Maven / Gradle / Ivy

There is a newer version: 8.3.0
Show newest version
package edu.isi.nlp.io;

import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.base.Charsets;
import com.google.common.base.Optional;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.cache.LoadingCache;
import com.google.common.io.CharSource;
import com.google.common.io.Files;
import edu.isi.nlp.files.FileUtils;
import edu.isi.nlp.files.KeyValueSource;
import edu.isi.nlp.symbols.Symbol;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.ExecutionException;

/**
 * Represents a corpus of concatenated text files which have been indexed via {@link
 * IndexFlatGigaword} or some similar procedure. Typically this is fine for backing user interfaces
 * (e.g. when it is too slow to copy all 12M files of Gigaword to the DMZ) but too slow for bulk
 * use.
 *
 * 

This should be merged into the newer {@link KeyValueSource} code. * * @author Ryan Gabbard */ public final class OffsetIndexedCorpus implements OriginalTextSource { private final DocIDToFileMapping corpusTextMapping; private final DocIDToFileMapping corpusIndexMapping; private final LoadingCache offsetIndexCache; private OffsetIndexedCorpus( final DocIDToFileMapping corpusTextMapping, final DocIDToFileMapping corpusIndexMapping, final LoadingCache offsetIndexCache) { this.corpusTextMapping = checkNotNull(corpusTextMapping); this.corpusIndexMapping = checkNotNull(corpusIndexMapping); this.offsetIndexCache = checkNotNull(offsetIndexCache); } public static OriginalTextSource fromTextAndOffsetFiles( DocIDToFileMapping corpusTextMapping, final DocIDToFileMapping corpusIndexMapping) { final LoadingCache indexCache = CacheBuilder.newBuilder() .maximumSize(3) .build( new CacheLoader() { @Override public OffsetIndex load(final File f) throws Exception { return OffsetIndices.readBinary(FileUtils.asCompressedByteSource(f)); } }); return new OffsetIndexedCorpus(corpusTextMapping, corpusIndexMapping, indexCache); } @Override public Optional getOriginalText(final Symbol docID) throws IOException { final Optional file = corpusTextMapping.fileForDocID(docID); if (file.isPresent()) { try { final Optional indexFile = corpusIndexMapping.fileForDocID(docID); if (indexFile.isPresent()) { final IndexedByteSource source = IndexedByteSource.from( Files.asByteSource(file.get()), offsetIndexCache.get(indexFile.get())); final Optional channelCharSource = source.channelAsCharSource(docID, Charsets.UTF_8); if (channelCharSource.isPresent()) { return Optional.of(channelCharSource.get().read()); } else { return Optional.absent(); } } else { throw new IOException("No index found for corpus chunk " + file); } } catch (ExecutionException e) { throw new IOException(e); } } else { return Optional.absent(); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy