package io.github.repir.Repository;

import io.github.repir.tools.extract.ExtractChannel;
import io.github.repir.Repository.Stopwords.StopWords;
import io.github.repir.Repository.Stopwords.StopwordsCache;
import io.github.repir.Strategy.Strategy;
import io.github.repir.tools.io.Datafile;
import io.github.repir.tools.io.Path;
import io.github.repir.tools.io.HDFSPath;
import io.github.repir.tools.lib.ArrayTools;
import static io.github.repir.tools.lib.ClassTools.*;
import io.github.repir.tools.lib.Log;
import io.github.repir.tools.lib.MathTools;
import io.github.repir.tools.lib.PrintTools;
import io.github.repir.tools.lib.StrTools;
import io.github.repir.MapReduceTools.RRConfiguration;
import io.github.repir.tools.extract.ExtractorConf;
import io.github.repir.tools.Words.englishStemmer;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.FileSystem;

/**
 * The Repository manages all persistent data for a collection: features that
 * were extracted from the collection, configuration settings and additional
 * data files.
 * <p>
 * The Repository is THE central component in RepIR. Each collection is
 * converted into its own Repository of extracted features. The extraction
 * process and the StoredFeatures can be tailor-made, programs can obtain
 * low-level access to the stored data, new features can be added, and
 * StoredDynamicFeatures can be used as small tables to store data that can be
 * modified.
 * <p>
 * The configuration of a Repository, and the communication of settings for
 * tasks, is done through an extension of Hadoop's Configuration class. A
 * single instance resides in the Repository, which can be accessed using
 * {@link #getConf()}, {@link #configuredString(java.lang.String)}, etc.
 * Configuration settings are usually seeded through configuration files, but
 * can also be added on the command line or from code.
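 * <p>
 * A minimal sketch of reading settings, assuming an opened Repository
 * instance named repository (the keys shown are the ones used in
 * readSettings()):
 * <pre>{@code
 * RRConfiguration conf = repository.getConf();
 * int partitions = repository.configuredInt("repository.partitions", 1);
 * String prefix = repository.configuredString("repository.prefix");
 * }</pre>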
 * <p>
 * For low-level access to a {@link StoredFeature}, obtain the feature through
 * the Repository with {@link #getFeature(java.lang.Class, java.lang.String[])},
 * using the feature's Class and optional parameters.
 */
public class Repository {

    public static Log log = new Log(Repository.class);
    protected HDFSPath basedir; // dir on HDFS containing the repository files
    protected String prefix; // prefix for every file, usually the config name
    protected FileSystem fs = null; // leave null for local FS, otherwise use HDFS
    protected static final String MASTERFILE_EXTENSION = ".master";
    protected long documentcount; // number of documents in the collection
    public static final double DEFAULT_LOAD_FACTOR = 0.75;
    protected int hashtablecapacity; // for the vocabulary hashtable
    protected int vocabularysize; // number of words in the vocabulary
    protected long cf; // number of words in the collection
    protected VocabularyToID vocabulary;
    protected int partitions = 1; // number of partitions the repository is divided in
    protected PartitionLocation partitionlocation; // gives fast access to the main location of each partition
    public HashMap<String, StoredFeature> storedfeaturesmap = new HashMap<String, StoredFeature>();
    private CollectionID collectionid;
    protected RRConfiguration configuration = new RRConfiguration();

    /**
     * Constructor for the creation of a new Repository with
     * {@link VocabularyBuilder}.
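     * <p>
     * A minimal usage sketch (the directory, prefix, and conf variable are
     * hypothetical):
     * <pre>{@code
     * HDFSPath dir = new HDFSPath(conf, "/data/repositories/trec");
     * Repository repository = new Repository(dir, "trec");
     * }</pre>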
     *
     * @param basedirname directory where the repository is stored
     * @param prefix prefix for all repository filenames (usually the
     * repository name)
     */
    public Repository(HDFSPath basedirname, String prefix) {
        setDirPrefix(basedirname, prefix);
    }

    private void setDirPrefix(HDFSPath basedirname, String prefix) {
        basedir = basedirname;
        this.prefix = prefix;
        if (!basedir.exists()) {
            log.fatal("Directory %s does not exist, please create", basedir.toString());
        }
    }

    private void setDirPrefix(RRConfiguration conf) {
        setDirPrefix(new HDFSPath(conf, conf.get("repository.dir", "")),
                conf.get("repository.prefix", ""));
    }

    /**
     * Constructor to open a Repository with a fully read Configuration.
     * Typically, this is used in MR classes that get a Configuration object
     * passed.
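     * <p>
     * A sketch of use inside a Hadoop task (the context variable is assumed):
     * <pre>{@code
     * Repository repository = new Repository(context.getConfiguration());
     * }</pre>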
     *
     * @param conf
     */
    public Repository(RRConfiguration conf) {
        setDirPrefix(conf);
        useConfiguration(conf);
        readSettings();
    }

    /**
     * Constructor to open a Repository using command line arguments. Typically
     * this is done in non-MR classes. The environment should contain the
     * necessary rr variables, and the first real argument should be the name
     * of the configuration script that is read from rr.confdir.
     *
     * @param args
     * @param template
     */
    public Repository(String args[], String template) {
        RRConfiguration conf = new RRConfiguration(args, template);
        setDirPrefix(conf);
        useConfiguration(conf);
        readConfiguration();
        readSettings();
    }

    public Repository(String args[]) {
        this(args, "");
    }

    public Repository(org.apache.hadoop.conf.Configuration conf) {
        this(RRConfiguration.convert(conf));
    }

    public void changeName(String newIndex) {
        String dir = configuration.get("repository.dir").replaceAll(prefix, newIndex);
        configuration.set("repository.dir", dir);
        configuration.set("repository.prefix", newIndex);
        basedir = new HDFSPath(configuration, configuration.get("repository.dir", ""));
        prefix = newIndex;
    }

    public boolean exists() {
        return basedir.exists();
    }

    public String getPrefix() {
        return prefix;
    }

    public PartitionLocation getPartitionLocation() {
        if (partitionlocation == null) {
            partitionlocation = PartitionLocation.get(this);
        }
        return partitionlocation;
    }

    public String[] getPartitionLocation(int partition) {
        return getPartitionLocation().read(partition);
    }

    public Repository(String conffile) {
        this(new String[]{conffile});
    }

    protected void useConfiguration(RRConfiguration conf) {
        this.configuration = conf;
        conf.setBoolean("fs.hdfs.impl.disable.cache", false);
        setFileSystem(conf.FS());
    }

    protected void readSettings() {
        partitions = configuredInt("repository.partitions", 1);
        setVocabularySize(configuredInt("repository.vocabularysize", 0));
        log.info("vocsize %d", configuredInt("repository.vocabularysize", 0));
        setCF(configuredLong("repository.corpustf", 0));
        documentcount = configuredInt("repository.documentcount", 0);
        hashtablecapacity = configuredInt("repository.hashtablecapacity", 0);
        getCollectionIDFeature();
    }

    protected void getStoredFeatures(String features[]) {
        for (String s : features) {
            StoredFeature f = (StoredFeature) getFeature(s);
        }
    }

    public void setFileSystem(FileSystem fs) {
        this.fs = fs;
    }

    public FileSystem getFS() {
        return fs;
    }

    public HDFSPath getBaseDir() {
        return basedir;
    }

    public HDFSPath getIndexDir() {
        return (HDFSPath) getBaseDir().getSubdir("repository");
    }

    static Pattern pattern = Pattern.compile("(\\d{4})"); // segments limited to 10000

    public static int getSegmentFromFilename(String filename) {
        filename = filename.substring(filename.lastIndexOf('/') + 1);
        Matcher matcher = pattern.matcher(filename);
        if (matcher.find()) {
            return Integer.parseInt(matcher.group());
        }
        return -1;
    }

    public String getFilename(String extension) {
        return basedir.getFilename(prefix + extension);
    }

    public Datafile getMasterFile() {
        return new Datafile(getFS(), basedir.getFilename(prefix + MASTERFILE_EXTENSION));
    }

    public String getTestsetName() {
        return this.configuredString("testset.name");
    }

    protected Datafile getStoredValuesFile() {
        return new Datafile(getFS(), basedir.getFilename(prefix + ".storedvalues"));
    }

    public long getCF() {
        return cf;
    }

    public long getDocumentCount() {
        return documentcount;
    }

    public void setDocumentCount(int documentcount) {
        this.documentcount = documentcount;
    }

    public void setPartitions(int nodes) {
        this.partitions = nodes;
    }

    public int getPartitions() {
        return partitions;
    }
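    // File layout note: each stored feature file carries a 4-digit segment
    // number, matched by the static pattern used in getSegmentFromFilename();
    // e.g. a (hypothetical) filename "trec.tokens.0003" would make
    // getSegmentFromFilename() return 3.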
    /**
     * Reads the Repository settings from the masterfile, so that a Repository
     * that was stored earlier can be reopened.
     */
    public void readConfiguration() {
        Datafile df = getMasterFile();
        Datafile storedvalues = this.getStoredValuesFile();
        if (basedir.exists() && df.exists()) {
            configuration.processConfigFile(df);
            if (storedvalues.exists()) {
                configuration.processConfigFile(storedvalues);
            }
        }
    }

    /**
     * Adds settings to the configuration. The configurationstring can contain
     * multiple settings, separated by commas.
     *
     * @param configurationstring
     */
    public void addConfiguration(String configurationstring) {
        if (configurationstring != null) {
            if (configurationstring.contains(",")) {
                for (String s : configurationstring.split(",")) {
                    configuration.processScript(s);
                }
            } else {
                configuration.processScript(configurationstring);
            }
        }
    }

    public void deleteMasterFile() {
        Datafile df = getMasterFile();
        if (df.exists()) {
            df.delete();
        }
    }

    private void initConfiguration() {
        configuration.set("repository.dir", basedir.getCanonicalPath());
        configuration.set("repository.prefix", prefix);
        configuration.setInt("repository.partitions", partitions);
        for (Map.Entry<String, StoredFeature> entry : storedfeaturesmap.entrySet()) {
            StoredFeature f = entry.getValue();
            configuration.addArray("repository.feature", f.getCanonicalName());
        }
        configuration.setLong("repository.vocabularysize", this.getVocabularySize());
        configuration.setLong("repository.corpustf", this.getCF());
        configuration.setLong("repository.documentcount", this.getDocumentCount());
    }

    public void writeConfiguration() {
        initConfiguration();
        Datafile masterfile = getMasterFile();
        masterfile.openWrite();
        configuration.writeString(masterfile, "repository.dir");
        configuration.writeString(masterfile, "repository.prefix");
        configuration.writeInt(masterfile, "repository.partitions");
        for (Map.Entry<String, StoredFeature> entry : storedfeaturesmap.entrySet()) {
            StoredFeature f = entry.getValue();
            configuration.addArray("repository.feature", f.getCanonicalName());
        }
        configuration.writeStrings(masterfile, "repository.feature");
        configuration.writeLong(masterfile, "repository.vocabularysize");
        configuration.writeLong(masterfile, "repository.corpustf");
        configuration.writeLong(masterfile, "repository.documentcount");
        masterfile.closeWrite();
    }

    public Collection<StoredFeature> getConfiguredFeatures() {
        getStoredFeatures(configuredStrings("repository.feature"));
        return storedfeaturesmap.values();
    }

    public HashMap<String, StoredFeature> getConfiguredFeaturesMap() {
        getStoredFeatures(configuredStrings("repository.feature"));
        return storedfeaturesmap;
    }

    public void featuresWriteCache() {
        for (StoredFeature f : getConfiguredFeatures()) {
            f.writeCache();
        }
    }

    public CollectionID getCollectionIDFeature() {
        if (collectionid == null) {
            String collidclassname = CollectionID.class.getSimpleName();
            for (String f : configuredStrings("repository.feature")) {
                if (f.startsWith(collidclassname)
                        && (f.length() == collidclassname.length()
                        || !Character.isLetter(f.charAt(collidclassname.length())))) {
                    collectionid = (CollectionID) getFeature(f);
                    break;
                }
            }
            if (collectionid == null) {
                collectionid = CollectionID.get(this);
            }
        }
        return collectionid;
    }

    /**
     * Use this method to obtain access to StoredFeatures, which allows the
     * system to reuse single instances of the exact same feature.
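     * <p>
     * A sketch with a hypothetical feature name; a canonical name has the form
     * "ClassName:field1:field2", where the fields are optional:
     * <pre>{@code
     * StoredFeature feature = (StoredFeature) repository.getFeature("DocTF:all");
     * }</pre>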
     *
     * @param canonicalname
     * @return a Feature instance identified by the canonicalname
     */
    public Feature getFeature(String canonicalname) {
        //log.info("getFeature( %s )", canonicalname);
        Feature f = storedfeaturesmap.get(canonicalname);
        if (f == null) {
            String parts[] = canonicalname.split(":");
            for (int i = 0; i < parts.length; i++) {
                parts[i] = parts[i].trim();
            }
            String classname = stripPackageNames(parts[0],
                    getClass().getPackage().getName(), Strategy.class.getPackage().getName());
            switch (parts.length) {
                case 1:
                    f = createFeature(classname);
                    break;
                case 2:
                    f = createFeature(classname, parts[1]);
                    break;
                case 3:
                    f = createFeature(classname, parts[1], parts[2]);
            }
        }
        return f;
    }

    protected void storeFeature(String label, StoredFeature feature) {
        //log.info("storeFeature %s %s", label, feature.getCanonicalName());
        storedfeaturesmap.put(label, feature);
    }

    protected StoredFeature getStoredFeature(String label) {
        //log.info("getStoredFeature %s", label);
        return storedfeaturesmap.get(label);
    }

    private Feature createFeature(String classname, String... field) {
        Feature f = null;
        Method cons;
        Class clazz = tryToClass(classname, getClass().getPackage().getName(),
                Strategy.class.getPackage().getName());
        if (clazz != null) {
            //log.info("createFeature %s %s", clazz.getSimpleName(), StrTools.concat(' ', field));
            switch (field.length) {
                case 0:
                    cons = tryGetMethod(clazz, "get", Repository.class);
                    f = (Feature) io.github.repir.tools.lib.ClassTools.invoke(cons, null, this);
                    break;
                case 1:
                    cons = tryGetMethod(clazz, "get", Repository.class, String.class);
                    f = (Feature) io.github.repir.tools.lib.ClassTools.invoke(cons, null, this, field[0]);
                    break;
                case 2:
                    cons = tryGetMethod(clazz, "get", Repository.class, String.class, String.class);
                    f = (Feature) io.github.repir.tools.lib.ClassTools.invoke(cons, null, this, field[0], field[1]);
                    break;
            }
        }
        return f;
    }

    public void unloadStoredDynamicFeatures() {
        Iterator<Entry<String, StoredFeature>> iter = storedfeaturesmap.entrySet().iterator();
        while (iter.hasNext()) {
            Entry<String, StoredFeature> entry = iter.next();
            if (entry.getValue() instanceof StoredDynamicFeature) {
                iter.remove();
            }
        }
    }

    public void unloadTermDocumentFeatures() {
        Iterator<Entry<String, StoredFeature>> iter = storedfeaturesmap.entrySet().iterator();
        while (iter.hasNext()) {
            Entry<String, StoredFeature> entry = iter.next();
            if (entry.getValue() instanceof TermDocumentFeature) {
                iter.remove();
            }
        }
    }

    public void unloadStoredDynamicFeature(Set<StoredFeature> sdf) {
        for (StoredFeature f : sdf) {
            unloadStoredDynamicFeature(f);
        }
    }

    public void unloadStoredDynamicFeature(StoredFeature sdf) {
        Iterator<Entry<String, StoredFeature>> iter = storedfeaturesmap.entrySet().iterator();
        while (iter.hasNext()) {
            Entry<String, StoredFeature> entry = iter.next();
            if (entry.getValue() == sdf) {
                iter.remove();
                break;
            }
        }
    }

    public Integer termToID(String term) {
        //log.info("termToID %s", term);
        return TermID.get(this).get(term);
    }

    public void setVocabularySize(int size) {
        vocabularysize = size;
        //hashtablecapacity = calculateCapacity();
    }

    public int getVocabularySize() {
        return vocabularysize;
    }

    public void setCF(long cf) {
        this.cf = cf;
    }

    /**
     * @return the Hadoop Configuration container that is used to maintain and
     * communicate all settings for the repository
     */
    public RRConfiguration getConf() {
        return this.configuration;
    }

    /**
     * @return the array of String values configured for key
     */
    public String[] configuredStrings(String key) {
        return configuration.getStrings(key);
    }

    public ArrayList<String> configuredStringList(String key) {
        return configuration.getStringList(key);
    }
    public ArrayList<Integer> configuredIntList(String key) {
        return configuration.getIntList(key);
    }

    public ArrayList<Long> configuredLongList(String key) {
        return configuration.getLongList(key);
    }

    public String configuredString(String key) {
        return configuration.get(key);
    }

    public String configurationName() {
        return configuration.get("rr.conf");
    }

    public String configuredString(String key, String defaultvalue) {
        return configuration.get(key, defaultvalue);
    }

    public int configuredInt(String key, int defaultvalue) {
        return configuration.getInt(key, defaultvalue);
    }

    public int configuredInt(String key) {
        return configuredInt(key, Integer.MIN_VALUE);
    }

    public long configuredLong(String key, long defaultvalue) {
        return configuration.getLong(key, defaultvalue);
    }

    public long configuredLong(String key) {
        return configuredLong(key, Long.MIN_VALUE);
    }

    /**
     * @return the boolean value configured for key, or defaultvalue if not set
     */
    public boolean configuredBoolean(String key, boolean defaultvalue) {
        return this.configuration.getBoolean(key, defaultvalue);
    }

    /**
     * Note: Hadoop 0.20 does not support double, so doubles are stored as
     * strings; if the value is not empty and not a valid double, a fatal
     * exception is the result.
     *
     * @return the double value of the key.
     */
    public double configuredDouble(String key, double defaultvalue) {
        double d = defaultvalue;
        String value = this.configuration.get(key);
        try {
            if (value != null && value.length() > 0) {
                d = Double.parseDouble(value);
            }
        } catch (NumberFormatException ex) {
            log.fatalexception(ex, "Configuration setting '%s' does not contain a valid double '%s'", key, value);
        }
        return d;
    }

    public String getParameterFile() {
        return configuredString("testset.queryparameters");
    }

    /**
     * @return A {@link Datafile} that is configured in "testset.topics" as the
     * file containing the topics for evaluation.
     */
    public Datafile getTopicsFile() {
        Datafile df = new Datafile(configuredString("rr.localdir") + "/" + configuredString("testset.topics"));
        if (!df.exists()) {
            df = new Datafile(getFS(), configuredString("repository.dir") + "/" + configuredString("testset.topics"));
        }
        if (!df.exists()) {
            log.fatal("topicfile %s does not exist", df.getCanonicalPath());
        }
        return df;
    }

    /**
     * @return A list of {@link Datafile}s that are configured in
     * "testset.qrels" as the files containing the query relevance labels for
     * evaluation.
     */
    public ArrayList<Datafile> getQrelFiles() throws IOException {
        String qr = configuredString("testset.qrels");
        if (qr == null) {
            log.fatal("testset.qrels not set");
        }
        String qrs[] = qr.split(",");
        ArrayList<Datafile> list = new ArrayList<Datafile>();
        for (String p : qrs) {
            Datafile f = new Datafile(configuredString("rr.localdir") + "/" + p);
            Path d = f.getDir();
            if (!d.exists()) {
                f = new Datafile(getFS(), configuredString("repository.dir") + "/" + p);
                d = f.getDir();
            }
            list.addAll(d.getFilesStartingWith(f.getFilename()));
        }
        return list;
    }

    public int[] tokenize(ExtractChannel attr) {
        if (vocabulary == null || !(vocabulary instanceof VocabularyToIDRAM)) {
            for (Feature f : this.getConfiguredFeatures()) {
                if (f instanceof VocabularyToIDRAM) {
                    vocabulary = (VocabularyToIDRAM) f;
                    vocabulary.openRead();
                    //log.info("VocMem opened %s", vocabulary.getCanonicalName());
                }
            }
        }
        if (vocabulary == null) {
            for (Feature f : this.getConfiguredFeatures()) {
                if (f instanceof VocabularyToID) {
                    vocabulary = (VocabularyToID) f;
                    vocabulary.openRead();
                    //log.info("Voc opened %s", vocabulary.getCanonicalName());
                }
            }
        }
        if (vocabulary == null) {
            log.fatal("you cannot tokenize if there is no Vocabulary in the repository");
        }
        return vocabulary.getContent(attr);
    }

    public int getPartition(String docid) {
        return getPartition(docid, getPartitions());
    }

    public static int getPartition(String docid, int partitions) {
        return MathTools.mod(docid.hashCode(), partitions);
    }

    public Repository[] getTuneRepositories() {
        //log.info("crossevaluate %s", configuredString("testset.crossevaluate"));
        String cross[] = StrTools.split(configuredString("testset.crossevaluate"), ",");
        if (cross.length == 1 && cross[0].equalsIgnoreCase("fold")) {
            return new Repository[]{this};
        }
        Repository r[] = new Repository[cross.length + 1];
        for (int i = 0; i < cross.length; i++) {
            r[i + 1] = new Repository(cross[i]);
        }
        r[0] = this;
        return r;
    }

    public String[] getStoredFreeParameters() {
        String freeparameters[] = configuredStrings("strategy.freeparameter");
        HashSet<String> list = new HashSet<String>();
        for (int i = 0; i < freeparameters.length; i++) {
            if (freeparameters[i].indexOf('=') > 0) {
                list.add(freeparameters[i].substring(0, freeparameters[i].indexOf('=')).trim());
            } else {
                list.add(freeparameters[i].trim());
            }
        }
        if (configuredString("testset.crossevaluate").equalsIgnoreCase("fold")) {
            list.add("fold");
        }
        return list.toArray(new String[list.size()]);
    }
    public Map<String, String> getFreeParameters() {
        String freeparameters[] = configuredStrings("strategy.freeparameter");
        HashMap<String, String> tuneparameters = new HashMap<String, String>();
        for (String s : freeparameters) {
            if (s.indexOf('=') > 0) {
                String parameter = s.substring(0, s.indexOf('=')).trim();
                String value = s.substring(s.indexOf('=') + 1).trim();
                tuneparameters.put(parameter, value);
            } else {
                // a parameter without an explicit range becomes a fixed
                // single-value sweep "value..value..1"
                String value = configuredString(s);
                tuneparameters.put(s, PrintTools.sprintf("%s..%s..%d", value, value, 1));
            }
        }
        return tuneparameters;
    }

    HashSet<Integer> stopwords;

    public HashSet<Integer> getStopwords() {
        if (stopwords == null) {
            StopwordsCache sw = StopwordsCache.get(this);
            stopwords = sw.getStopwords();
            if (stopwords.size() == 0) {
                stopwords = StopWords.get(this).getIntSet();
            }
        }
        return stopwords;
    }

    private ExtractorConf collectionExtractor;

    public ExtractorConf getCollectionExtractor() {
        if (collectionExtractor == null) {
            collectionExtractor = new ExtractorConf(this.getConf());
        }
        return collectionExtractor;
    }

    public Term getTerm(String term) {
        if (term == null) {
            return null;
        }
        if (term.startsWith("@#")) {
            // "@#<termid>" references a term directly by id; strip the marker
            // before parsing
            int termid = Integer.parseInt(term.substring(2));
            if (termid < 0) {
                return null;
            }
            TermString termstring = TermString.get(this);
            String stemmed = termstring.readValue(termid);
            return new Term(termid, null, stemmed, getStopwords().contains(termid));
        } else {
            String processedterm;
            if (term.startsWith("@")) {
                // "@<term>" is used as-is, without lowercasing or stemming
                processedterm = term.substring(1);
                term = null;
            } else {
                processedterm = englishStemmer.get().stem(term.toLowerCase());
            }
            int termid = termToID(processedterm);
            return new Term(termid, term, processedterm, getStopwords().contains(termid));
        }
    }

    public Term getProcessedTerm(String term) {
        if (term == null) {
            return null;
        }
        int termid = termToID(term);
        return new Term(termid, null, term, getStopwords().contains(termid));
    }

    public Term getTerm(int termid) {
        if (termid < 0) {
            return null;
        }
        TermString termstring = TermString.get(this);
        String stemmed = termstring.readValue(termid);
        return new Term(termid, null, stemmed, getStopwords().contains(termid));
    }
}
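// A usage sketch (the configuration file name and document id are
// hypothetical):
//
//   Repository repository = new Repository("trec.conf");
//   Term term = repository.getTerm("information");
//   int partition = Repository.getPartition("doc-00001", repository.getPartitions());
//
// getTerm() lowercases and stems the input, resolves its term id in the
// vocabulary, and marks whether it is a stopword.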