package io.github.repir.Repository;
import io.github.repir.tools.extract.ExtractChannel;
import io.github.repir.Repository.Stopwords.StopWords;
import io.github.repir.Repository.Stopwords.StopwordsCache;
import io.github.repir.Strategy.Strategy;
import io.github.repir.tools.io.Datafile;
import io.github.repir.tools.io.Path;
import io.github.repir.tools.io.HDFSPath;
import io.github.repir.tools.lib.ArrayTools;
import static io.github.repir.tools.lib.ClassTools.*;
import io.github.repir.tools.lib.Log;
import io.github.repir.tools.lib.MathTools;
import io.github.repir.tools.lib.PrintTools;
import io.github.repir.tools.lib.StrTools;
import io.github.repir.MapReduceTools.RRConfiguration;
import io.github.repir.tools.extract.ExtractorConf;
import io.github.repir.tools.Words.englishStemmer;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.FileSystem;
/**
* The Repository manages all persistent data for a collection: features that
* were extracted from the collection, configuration settings and additional
* data files.
*
* The Repository is THE central component in RepIR. Each collection is
* converted into its own Repository of extracted features. The extraction
* process and the StoredFeatures can be tailor-made, programs can obtain
* low-level access to the stored data, new features can be added, and
* StoredDynamicFeatures can be used as small tables to store data that can
* be modified.
*
* The configuration of a Repository, and the communication of settings for
* tasks, is done through an extension of Hadoop's Configuration class. A
* single instance resides in the Repository, which can be accessed using
* {@link #getConf()}, {@link #configuredString(java.lang.String)},
* etc. The Configuration settings are usually seeded through configuration
* files, but can also be added through the command line or from code.
*
* For low-level access to a {@link StoredFeature}, you should obtain the
* feature through the Repository with {@link #getFeature(java.lang.String)},
* using the feature's canonical name, which consists of the feature's class
* name and optional parameters.
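* <p>
* A minimal usage sketch (the configuration name "myrepository" is a
* hypothetical example):
* <pre>{@code
* Repository repository = new Repository("myrepository");
* long documents = repository.getDocumentCount();
* CollectionID collectionid = repository.getCollectionIDFeature();
* }</pre>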
*/
public class Repository {
public static Log log = new Log(Repository.class);
protected HDFSPath basedir; // dir on HDFS containing the repository files
protected String prefix; // prefix for every file, usually configname
protected FileSystem fs = null; // leave null for local FS, otherwise use HDFS
protected static final String MASTERFILE_EXTENSION = ".master";
protected long documentcount; // number of documents in collection
public static final double DEFAULT_LOAD_FACTOR = 0.75;
protected int hashtablecapacity; // for the vocabulary hashtable
protected int vocabularysize; // number of words in vocabulary
protected long cf; // number of words in collection
protected VocabularyToID vocabulary;
protected int partitions = 1; // number of partitions the repository is divided in
protected PartitionLocation partitionlocation; // gives fast access to the main location of each partition
public HashMap<String, StoredFeature> storedfeaturesmap = new HashMap<String, StoredFeature>();
private CollectionID collectionid;
protected RRConfiguration configuration = new RRConfiguration();
/**
* Constructor for the creation of a new Repository with
* {@link VocabularyBuilder}.
*
* @param basedirname directory where the repository is stored
* @param prefix prefix for all repository filenames (usually the repository
* name)
*/
public Repository(HDFSPath basedirname, String prefix) {
setDirPrefix(basedirname, prefix);
}
private void setDirPrefix(HDFSPath basedirname, String prefix) {
basedir = basedirname;
this.prefix = prefix;
if (!basedir.exists()) {
log.fatal("Directory %s does not exists, please create", basedir.toString());
}
}
private void setDirPrefix(RRConfiguration conf) {
setDirPrefix(new HDFSPath(conf, conf.get("repository.dir", "")), conf.get("repository.prefix", ""));
}
/**
* Constructor to open a Repository with a fully read Configuration.
* Typically, this is used in MR classes, that get a Configuration object
* passed.
*
* @param conf
*/
public Repository(RRConfiguration conf) {
setDirPrefix(conf);
useConfiguration(conf);
readSettings();
}
/**
* Constructor to open a Repository using command line arguments. Typically
* this is done in non-MR classes. The environment should contain the
* necessary rr variables, and the first real argument should be the name of
* the configuration script that is read from rr.confdir.
*
* @param args
* @param template
*/
public Repository(String args[], String template) {
RRConfiguration conf = new RRConfiguration(args, template);
setDirPrefix(conf);
useConfiguration(conf);
readConfiguration();
readSettings();
}
public Repository(String args[]) {
this(args, "");
}
public Repository(org.apache.hadoop.conf.Configuration conf) {
this(RRConfiguration.convert(conf));
}
public void changeName(String newIndex) {
String dir = configuration.get("repository.dir").replaceAll(prefix, newIndex);
configuration.set("repository.dir", dir);
configuration.set("repository.prefix", newIndex);
basedir = new HDFSPath(configuration, configuration.get("repository.dir", ""));
prefix = newIndex;
}
public boolean exists() {
return basedir.exists();
}
public String getPrefix() {
return prefix;
}
public PartitionLocation getPartitionLocation() {
if (partitionlocation == null) {
partitionlocation = PartitionLocation.get(this);
}
return partitionlocation;
}
public String[] getPartitionLocation(int partition) {
return getPartitionLocation().read(partition);
}
public Repository(String conffile) {
this(new String[]{ conffile });
}
protected void useConfiguration(RRConfiguration conf) {
this.configuration = conf;
conf.setBoolean("fs.hdfs.impl.disable.cache", false);
setFileSystem(conf.FS());
}
protected void readSettings() {
partitions = configuredInt("repository.partitions", 1);
setVocabularySize(configuredInt("repository.vocabularysize", 0));
log.info("vocsize %d", configuredInt("repository.vocabularysize", 0));
setCF(configuredLong("repository.corpustf", 0));
documentcount = configuredInt("repository.documentcount", 0);
hashtablecapacity = configuredInt("repository.hashtablecapacity", 0);
getCollectionIDFeature();
}
protected void getStoredFeatures(String features[]) {
for (String s : features) {
StoredFeature f = (StoredFeature) getFeature(s); // instantiating the feature registers it in storedfeaturesmap
}
}
public void setFileSystem(FileSystem fs) {
this.fs = fs;
}
public FileSystem getFS() {
return fs;
}
public HDFSPath getBaseDir() {
return basedir;
}
public HDFSPath getIndexDir() {
return (HDFSPath) getBaseDir().getSubdir("repository");
}
static Pattern pattern = Pattern.compile("(\\d{4})"); // segments limited to 10000
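/**
* Extracts the partition number from a segment filename. By convention, a
* segment filename contains a 4-digit sequence number, so a hypothetical
* filename "prefix.feature.0003" resolves to partition 3; -1 is returned
* when no 4-digit number is found.
*/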
public static int getSegmentFromFilename(String filename) {
filename = filename.substring(filename.lastIndexOf('/') + 1);
Matcher matcher = pattern.matcher(filename);
if (matcher.find()) {
return Integer.parseInt(matcher.group());
}
return -1;
}
public String getFilename(String extension) {
return basedir.getFilename(prefix + extension);
}
public Datafile getMasterFile() {
return new Datafile(getFS(), basedir.getFilename(prefix + MASTERFILE_EXTENSION));
}
public String getTestsetName() {
return this.configuredString("testset.name");
}
protected Datafile getStoredValuesFile() {
return new Datafile(getFS(), basedir.getFilename(prefix + ".storedvalues"));
}
public long getCF() {
return cf;
}
public long getDocumentCount() {
return documentcount;
}
public void setDocumentCount(int documentcount) {
this.documentcount = documentcount;
}
public void setPartitions(int nodes) {
this.partitions = nodes;
}
public int getPartitions() {
return partitions;
}
/**
* reads the Repository settings from the masterfile and the storedvalues
* file into the configuration
*/
public void readConfiguration() {
Datafile df = getMasterFile();
Datafile storedvalues = this.getStoredValuesFile();
if (basedir.exists() && df.exists()) {
configuration.processConfigFile(df);
if (storedvalues.exists()) {
configuration.processConfigFile(storedvalues);
}
}
}
/**
* The configuration string can contain one or more settings, separated by
* commas; each setting is processed as a configuration script line.
*
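* For instance, a hypothetical call that sets two values at once:
* <pre>{@code
* repository.addConfiguration("repository.partitions=2,testset.name=trec1");
* }</pre>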
* @param configurationstring
*/
public void addConfiguration(String configurationstring) {
if (configurationstring != null) {
if (configurationstring.contains(",")) {
for (String s : configurationstring.split(",")) {
configuration.processScript(s);
}
} else {
configuration.processScript(configurationstring);
}
}
}
public void deleteMasterFile() {
Datafile df = getMasterFile();
if (df.exists()) {
df.delete();
}
}
private void initConfiguration() {
configuration.set("repository.dir", basedir.getCanonicalPath());
configuration.set("repository.prefix", prefix);
configuration.setInt("repository.partitions", partitions);
for (Map.Entry<String, StoredFeature> entry : storedfeaturesmap.entrySet()) {
StoredFeature f = entry.getValue();
configuration.addArray("repository.feature", f.getCanonicalName());
}
configuration.setLong("repository.vocabularysize", this.getVocabularySize());
configuration.setLong("repository.corpustf", this.getCF());
configuration.setLong("repository.documentcount", this.getDocumentCount());
}
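/**
* stores the Repository settings in the masterfile, so that the Repository
* can be reopened using its configuration name
*/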
public void writeConfiguration() {
initConfiguration();
Datafile masterfile = getMasterFile();
masterfile.openWrite();
configuration.writeString(masterfile, "repository.dir");
configuration.writeString(masterfile, "repository.prefix");
configuration.writeInt(masterfile, "repository.partitions");
for (Map.Entry<String, StoredFeature> entry : storedfeaturesmap.entrySet()) {
StoredFeature f = entry.getValue();
configuration.addArray("repository.feature", f.getCanonicalName());
}
configuration.writeStrings(masterfile, "repository.feature");
configuration.writeLong(masterfile, "repository.vocabularysize");
configuration.writeLong(masterfile, "repository.corpustf");
configuration.writeLong(masterfile, "repository.documentcount");
masterfile.closeWrite();
}
public Collection<StoredFeature> getConfiguredFeatures() {
getStoredFeatures(configuredStrings("repository.feature"));
return storedfeaturesmap.values();
}
public HashMap<String, StoredFeature> getConfiguredFeaturesMap() {
getStoredFeatures(configuredStrings("repository.feature"));
return storedfeaturesmap;
}
public void featuresWriteCache() {
for (StoredFeature f : getConfiguredFeatures()) {
f.writeCache();
}
}
public CollectionID getCollectionIDFeature() {
if (collectionid == null) {
String collidclassname = CollectionID.class.getSimpleName();
for (String f : configuredStrings("repository.feature")) {
if (f.startsWith(collidclassname)
&& (f.length() == collidclassname.length()
|| !Character.isLetter(f.charAt(collidclassname.length())))) {
collectionid = (CollectionID) getFeature(f);
break;
}
}
if (collectionid == null) {
collectionid = CollectionID.get(this);
}
}
return collectionid;
}
/**
* Use this method to obtain access to StoredFeatures, which allows the
* system to reuse single instances of the exact same feature.
*
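* A canonical name consists of the feature's class name, optionally followed
* by colon-separated parameters, e.g. a hypothetical "SomeFeature:fieldname":
* <pre>{@code
* Feature feature = repository.getFeature("SomeFeature:fieldname");
* }</pre>
*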
* @param canonicalname
* @return a Feature instance identified by the canonicalname
*/
public Feature getFeature(String canonicalname) {
//log.info("getFeature( %s )", canonicalname);
Feature f = storedfeaturesmap.get(canonicalname);
if (f == null) {
String parts[] = canonicalname.split(":");
for (int i = 0; i < parts.length; i++) {
parts[i] = parts[i].trim();
}
String classname = stripPackageNames(parts[0], getClass().getPackage().getName(), Strategy.class.getPackage().getName());
switch (parts.length) {
case 1:
f = createFeature(classname);
break;
case 2:
f = createFeature(classname, parts[1]);
break;
case 3:
f = createFeature(classname, parts[1], parts[2]);
}
}
return f;
}
protected void storeFeature(String label, StoredFeature feature) {
//log.info("storeFeature %s %s", label, feature.getCanonicalName());
storedfeaturesmap.put(label, feature);
}
protected StoredFeature getStoredFeature(String label) {
//log.info("getStoredFeature %s", label);
return storedfeaturesmap.get(label);
}
private Feature createFeature(String classname, String... field) {
Feature f = null;
Method cons;
Class<?> clazz = tryToClass(classname, getClass().getPackage().getName(), Strategy.class.getPackage().getName());
if (clazz != null) {
//log.info("createFeature %s %s", clazz.getSimpleName(), StrTools.concat(' ', field));
switch (field.length) {
case 0:
cons = tryGetMethod(clazz, "get", Repository.class);
f = (Feature) io.github.repir.tools.lib.ClassTools.invoke(cons, null, this);
break;
case 1:
cons = tryGetMethod(clazz, "get", Repository.class, String.class);
f = (Feature) io.github.repir.tools.lib.ClassTools.invoke(cons, null, this, field[0]);
break;
case 2:
cons = tryGetMethod(clazz, "get", Repository.class, String.class, String.class);
f = (Feature) io.github.repir.tools.lib.ClassTools.invoke(cons, null, this, field[0], field[1]);
break;
}
}
return f;
}
public void unloadStoredDynamicFeatures() {
Iterator<Entry<String, StoredFeature>> iter = storedfeaturesmap.entrySet().iterator();
while (iter.hasNext()) {
Entry<String, StoredFeature> entry = iter.next();
if (entry.getValue() instanceof StoredDynamicFeature) {
iter.remove();
}
}
}
public void unloadTermDocumentFeatures() {
Iterator<Entry<String, StoredFeature>> iter = storedfeaturesmap.entrySet().iterator();
while (iter.hasNext()) {
Entry<String, StoredFeature> entry = iter.next();
if (entry.getValue() instanceof TermDocumentFeature) {
iter.remove();
}
}
}
public void unloadStoredDynamicFeature(Set<StoredFeature> sdf) {
for (StoredFeature f : sdf) {
unloadStoredDynamicFeature(f);
}
}
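/**
* Removes the given feature from the feature cache, so that a subsequent
* {@link #getFeature(java.lang.String)} obtains a fresh instance.
*/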
public void unloadStoredDynamicFeature(StoredFeature sdf) {
Iterator<Entry<String, StoredFeature>> iter = storedfeaturesmap.entrySet().iterator();
while (iter.hasNext()) {
Entry<String, StoredFeature> entry = iter.next();
if (entry.getValue() instanceof StoredFeature) {
if (entry.getValue() == sdf) {
iter.remove();
break;
}
}
}
}
public Integer termToID(String term) {
//log.info("termToID %s", term);
return TermID.get(this).get(term);
}
public void setVocabularySize(int size) {
vocabularysize = size;
//hashtablecapacity = calculateCapacity();
}
public int getVocabularySize() {
return vocabularysize;
}
public void setCF(long cf) {
this.cf = cf;
}
/**
* @return the Hadoop Configuration container that is used to maintain and
* communicate all settings for the repository
*/
public RRConfiguration getConf() {
return this.configuration;
}
/**
* @return the array of String values configured for the given key
*/
public String[] configuredStrings(String key) {
return configuration.getStrings(key);
}
public ArrayList<String> configuredStringList(String key) {
return configuration.getStringList(key);
}
public ArrayList<Integer> configuredIntList(String key) {
return configuration.getIntList(key);
}
public ArrayList<Long> configuredLongList(String key) {
return configuration.getLongList(key);
}
public String configuredString(String key) {
return configuration.get(key);
}
public String configurationName() {
return configuration.get("rr.conf");
}
public String configuredString(String key, String defaultvalue) {
return configuration.get(key, defaultvalue);
}
public int configuredInt(String key, int defaultvalue) {
return configuration.getInt(key, defaultvalue);
}
public int configuredInt(String key) {
return configuredInt(key, Integer.MIN_VALUE);
}
public long configuredLong(String key, long defaultvalue) {
return configuration.getLong(key, defaultvalue);
}
public long configuredLong(String key) {
return configuredLong(key, Long.MIN_VALUE);
}
/**
* @return the boolean value configured for the given key, or the default
* value if the key is not set
*/
public boolean configuredBoolean(String key, boolean defaultvalue) {
return this.configuration.getBoolean(key, defaultvalue);
}
/**
* Note: Hadoop 0.20 does not support double, so these values are stored as
* strings; if the value is not empty and not a valid double, a fatal
* exception results.
*
* @return the double value of the key, or the default value if the key is
* not set.
*/
public double configuredDouble(String key, double defaultvalue) {
double d = defaultvalue;
String value = this.configuration.get(key);
try {
if (value != null && value.length() > 0) {
d = Double.parseDouble(value);
}
} catch (NumberFormatException ex) {
log.fatalexception(ex, "Configuration setting '%s' does not contain a valid double '%s'", key, value);
}
return d;
}
public String getParameterFile() {
return configuredString("testset.queryparameters");
}
/**
* @return A {@link Datafile} that is configured in "testset.topics" as the
* file containing the topics for evaluation.
*/
public Datafile getTopicsFile() {
Datafile df = new Datafile(configuredString("rr.localdir") + "/" + configuredString("testset.topics"));
if (!df.exists()) {
df = new Datafile(getFS(),
configuredString("repository.dir") + "/" + configuredString("testset.topics"));
}
if (!df.exists()) {
log.fatal("topicfile %s does not exists", df.getCanonicalPath());
}
return df;
}
/**
* @return A list of {@link Datafile}s that is configured in "testset.qrels"
* as the files containing the query relevance labels for evaluation.
*/
public ArrayList<Datafile> getQrelFiles() throws IOException {
String qr = configuredString("testset.qrels");
if (qr == null) {
log.fatal("testset.qrels not set");
}
String qrs[] = qr.split(",");
ArrayList<Datafile> list = new ArrayList<Datafile>();
for (String p : qrs) {
Datafile f = new Datafile(configuredString("rr.localdir") + "/" + p);
Path d = f.getDir();
if (!d.exists()) {
f = new Datafile(getFS(), configuredString("repository.dir") + "/" + p);
d = f.getDir();
}
list.addAll(d.getFilesStartingWith(f.getFilename()));
}
return list;
}
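/**
* Converts the contents of an {@link ExtractChannel} into an array of term
* IDs, using a {@link VocabularyToID} feature from the repository and
* preferring an in-memory {@link VocabularyToIDRAM} when one is configured.
* Fails fatally when the repository contains no vocabulary.
*/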
public int[] tokenize(ExtractChannel attr) {
if (vocabulary == null || !(vocabulary instanceof VocabularyToIDRAM)) {
for (Feature f : this.getConfiguredFeatures()) {
if (f instanceof VocabularyToIDRAM) {
vocabulary = (VocabularyToIDRAM) f;
vocabulary.openRead();
//log.info("VocMem opened %s", vocabulary.getCanonicalName());
}
}
}
if (vocabulary == null) {
for (Feature f : this.getConfiguredFeatures()) {
if (f instanceof VocabularyToID) {
vocabulary = (VocabularyToID) f;
vocabulary.openRead();
//log.info("Voc opened %s", vocabulary.getCanonicalName());
}
}
}
if (vocabulary == null) {
log.fatal("you cannot tokenize if there is no Vocabulary in the repository");
}
return vocabulary.getContent(attr);
}
public int getPartition(String docid) {
return getPartition(docid, getPartitions());
}
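/**
* Assigns a document to a partition based on the hash code of its collection
* ID, for example (with a hypothetical document ID):
* <pre>{@code
* int partition = Repository.getPartition("clueweb09-en0000-00-00000", 10);
* }</pre>
*/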
public static int getPartition(String docid, int partitions) {
return MathTools.mod(docid.hashCode(), partitions);
}
public Repository[] getTuneRepositories() {
//log.info("crossevaluate %s", getConfigurationString("testset.crossevaluate"));
String cross[] = StrTools.split(configuredString("testset.crossevaluate"), ",");
if (cross.length == 1 && cross[0].equalsIgnoreCase("fold")) {
return new Repository[]{this};
}
Repository r[] = new Repository[cross.length + 1];
for (int i = 0; i < cross.length; i++) {
r[i + 1] = new Repository(cross[i]);
}
r[0] = this;
return r;
}
public String[] getStoredFreeParameters() {
String freeparameters[] = configuredStrings("strategy.freeparameter");
HashSet<String> list = new HashSet<String>();
for (int i = 0; i < freeparameters.length; i++) {
if (freeparameters[i].indexOf('=') > 0) {
list.add(freeparameters[i].substring(0, freeparameters[i].indexOf('=')).trim());
} else {
list.add(freeparameters[i].trim());
}
}
if (configuredString("testset.crossevaluate").equalsIgnoreCase("fold")) {
list.add("fold");
}
return list.toArray(new String[list.size()]);
}
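/**
* @return a map of the free parameters configured in "strategy.freeparameter";
* a parameter without an assigned value is mapped to a single-point range
* "value..value..1" built from its currently configured value.
*/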
public Map<String, String> getFreeParameters() {
String freeparameters[] = configuredStrings("strategy.freeparameter");
HashMap<String, String> tuneparameters = new HashMap<String, String>();
for (String s : freeparameters) {
if (s.indexOf('=') > 0) {
String parameter = s.substring(0, s.indexOf('=')).trim();
String value = s.substring(s.indexOf('=') + 1).trim();
tuneparameters.put(parameter, value);
} else {
String value = configuredString(s);
tuneparameters.put(s, PrintTools.sprintf("%s..%s..%d", value, value, 1));
}
}
return tuneparameters;
}
HashSet<Integer> stopwords;
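/**
* @return the set of term IDs of stopwords, read from the
* {@link StopwordsCache} feature, falling back to the {@link StopWords}
* feature when the cache is empty
*/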
public HashSet<Integer> getStopwords() {
if (stopwords == null) {
StopwordsCache sw = StopwordsCache.get(this);
stopwords = sw.getStopwords();
if (stopwords.size() == 0) {
stopwords = StopWords.get(this).getIntSet();
}
}
return stopwords;
}
private ExtractorConf collectionExtractor;
public ExtractorConf getCollectionExtractor() {
if (collectionExtractor == null) {
collectionExtractor = new ExtractorConf(this.getConf());
}
return collectionExtractor;
}
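/**
* Resolves a query term to a {@link Term}. A term prefixed with "@#" is
* interpreted as a numeric term ID, a term prefixed with "@" is used as an
* already processed (lowercased and stemmed) term, and any other term is
* lowercased and stemmed before lookup. For example, hypothetical inputs
* "@#21", "@appl" and "Apples" could all resolve to the same Term.
*/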
public Term getTerm(String term) {
if (term == null) {
return null;
}
if (term.startsWith("@#")) {
int termid = Integer.parseInt(term.substring(2)); // parse the term ID that follows the "@#" prefix
if (termid < 0) {
return null;
}
TermString termstring = TermString.get(this);
String stemmed = termstring.readValue(termid);
return new Term(termid, null, stemmed, getStopwords().contains(termid));
} else {
String processedterm;
if (term.startsWith("@")) {
processedterm = term.substring(1);
term = null;
} else {
processedterm = englishStemmer.get().stem(term.toLowerCase());
}
int termid = termToID(processedterm);
return new Term(termid, term, processedterm, getStopwords().contains(termid));
}
}
public Term getProcessedTerm(String term) {
if (term == null) {
return null;
}
int termid = termToID(term);
return new Term(termid, null, term, getStopwords().contains(termid));
}
public Term getTerm(int termid) {
if (termid < 0) {
return null;
}
TermString termstring = TermString.get(this);
String stemmed = termstring.readValue(termid);
return new Term(termid, null, stemmed, getStopwords().contains(termid));
}
}