package io.github.repir.Retriever.Tuner;

import io.github.repir.Repository.ModelParameters;
import io.github.repir.Retriever.PostingIteratorReusable;
import java.io.IOException;
import io.github.repir.Repository.Repository;
import io.github.repir.Retriever.Query;
import io.github.repir.tools.lib.ArrayTools;
import io.github.repir.tools.lib.Log;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;

/**
 * Retriever that supports running the same model with several parameter
 * settings by reusing the loaded data.
 * <p>
 * The result of the retrieval process is a combination of parameter settings
 * and the resulting mean average precision. These are stored in the
 * {@link ModelParameters} feature.
 * <p>
 * Two modes of operation are currently supported. (1) If
 * "testset.crossevaluate=fold", 10-fold cross evaluation is performed by
 * dividing the set sequentially into 10 folds and adding "fold=" plus the fold
 * number to the parameters that are stored with the results. (2)
 * Leave-a-testset-out if "testset.crossevaluate" contains a (comma-separated
 * list of) other testsets; the results for the entire set are then scored per
 * parameter setting, e.g. "testset.crossevaluate=trec2kld,trec3kld".
 */
public class Retriever extends io.github.repir.Retriever.Reusable.Retriever {

   public static Log log = new Log(Retriever.class);
   protected PostingIteratorReusable postingiterator;

   public Retriever(Repository repository) {
      super(repository);
   }

   public Retriever(Repository repository, org.apache.hadoop.mapreduce.Mapper.Context mappercontext) {
      super(repository, mappercontext);
   }

   public Retriever(Repository repository, org.apache.hadoop.mapreduce.Reducer.Context reducercontext) {
      super(repository, reducercontext);
   }

   @Override
   public Job createJob(String path) throws IOException {
      return new Job(this, path);
   }

   /**
    * @return a list of Variants to be tuned. These {@link Query.Variant} can be
    * added to the Query to tune. By default, only the combinations of parameter
    * settings within the configured parameter ranges that are not yet in
    * ModelParameters are returned. By setting "tuner.overwrite=true", all
    * variants within the parameter ranges are tried.
    */
   public ArrayList<Query.Variant> getVariants() {
      ArrayList<Parameter> parameters = getParameters();
      ArrayList<Query.Variant> variants = new ArrayList<Query.Variant>();
      ArrayList<String> settings = generatePoints(parameters);
      if (!repository.configuredBoolean("tuner.overwrite", false)) {
         settings = removeKnownSettings(repository, settings);
      }
      for (String conf : settings) {
         Query.Variant v = new Query.Variant();
         v.configuration = conf;
         v.retrievalmodelclass = repository.configuredString("retriever.strategy");
         v.scorefunctionclass = repository.configuredString("retriever.scorefunction");
         variants.add(v);
      }
      return variants;
   }

   /**
    * @return a list of Parameters that have been configured as
    * "strategy.freeparameter". This implementation supports a grid search, thus
    * parameters are configured with a range and step, e.g.
    * "+strategy.freeparameter=kld.mu=100..2500..100" will try the settings 100,
    * 200, ..., 2500 for "kld.mu".
    */
   public ArrayList<Parameter> getParameters() {
      ArrayList<Parameter> parameters = new ArrayList<Parameter>();
      for (Map.Entry<String, String> p : repository.getFreeParameters().entrySet()) {
         parameters.add(new ParameterGrid(p.getKey(), p.getValue()));
      }
      Collections.sort(parameters);
      for (int i = 0; i < parameters.size(); i++) {
         Parameter p = parameters.get(i);
         p.index = i;
         p.generatePoints();
      }
      return parameters;
   }
   /**
    * Returns a list of Strings with all possible combinations of parameter
    * settings within range.
    */
   public ArrayList<String> generatePoints(ArrayList<Parameter> parameters) {
      ArrayList<String> settings = new ArrayList<String>();
      int parami[] = new int[parameters.size()];
      for (int i = 0; i < parameters.size(); i++) {
         parami[i] = parameters.get(i).getPoints().size() - 1;
      }
      while (parami[0] >= 0) {
         settings.add(getSettings(parameters, parami));
         for (int i = parameters.size() - 1; i >= 0; i--) {
            if (i < parameters.size() - 1) {
               parami[i + 1] = parameters.get(i + 1).getPoints().size() - 1;
            }
            if (--parami[i] >= 0) {
               break;
            }
         }
      }
      return settings;
   }

   private String getSettings(ArrayList<Parameter> parameters, int settings[]) {
      ArrayList<String> list = new ArrayList<String>();
      for (Parameter p : parameters) {
         String pstr = p.parameter + "=" + p.getPoints().get(settings[p.index]).toString();
         list.add(pstr);
      }
      return ArrayTools.toString(list, ",");
   }

   private ArrayList<String> removeKnownSettings(Repository repository, ArrayList<String> settings) {
      String[] storedparameters = repository.getStoredFreeParameters();
      repository.getConf().setInt("fold", 0); // in case n-fold cross evaluation is used
      ModelParameters modelparameters = ModelParameters.get(repository, repository.configurationName());
      modelparameters.setDataBufferSize(1000000);
      modelparameters.openRead();
      Iterator<String> iter = settings.iterator();
      while (iter.hasNext()) {
         String s = iter.next();
         repository.addConfiguration(s);
         ModelParameters.Record newRecord = modelparameters.newRecord(storedparameters);
         ModelParameters.Record found = modelparameters.read(newRecord);
         if (found != newRecord) {
            iter.remove();
         }
      }
      return settings;
   }

   /**
    * Checks if the settings in ModelParameters are complete, i.e. are recorded
    * for all folds. You should not need this; it is only used for testing.
    */
   public ArrayList<String> removeKnownSettingsFold(Repository repository, ArrayList<String> settings) {
      String[] storedparameters = repository.getStoredFreeParameters();
      ModelParameters modelparameters = ModelParameters.get(repository, repository.configurationName());
      modelparameters.setDataBufferSize(1000000);
      modelparameters.openRead();
      Iterator<String> iter = settings.iterator();
      while (iter.hasNext()) {
         String s = iter.next();
         repository.addConfiguration(s);
         boolean allthere = true;
         for (int i = 0; i < 10; i++) {
            repository.getConf().setInt("fold", i);
            ModelParameters.Record newRecord = modelparameters.newRecord(storedparameters);
            ModelParameters.Record found = modelparameters.read(newRecord);
            if (found == newRecord) {
               allthere = false;
               break;
            }
         }
         if (allthere) {
            iter.remove();
         }
      }
      return settings;
   }
}
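
The class javadoc describes two evaluation modes. For "testset.crossevaluate=fold", the topic set is divided sequentially into 10 folds and a "fold" parameter is stored alongside each result; the split itself is not part of this class. The standalone sketch below illustrates one reading of a sequential split into contiguous blocks; the class name, method name and the block layout are illustrative assumptions, not RepIR API.

import java.util.ArrayList;
import java.util.List;

public class SequentialFoldSketch {

   // Illustrative only: split query ids into contiguous blocks, one block per fold,
   // e.g. 50 topics over 10 folds gives folds of 5 consecutive topics each.
   static List<List<Integer>> sequentialFolds(List<Integer> queryIds, int folds) {
      List<List<Integer>> result = new ArrayList<>();
      int per = (int) Math.ceil(queryIds.size() / (double) folds);
      for (int f = 0; f < folds; f++) {
         int from = Math.min(f * per, queryIds.size());
         int to = Math.min(from + per, queryIds.size());
         result.add(new ArrayList<>(queryIds.subList(from, to)));
      }
      return result;
   }

   public static void main(String[] args) {
      List<Integer> ids = new ArrayList<>();
      for (int i = 301; i <= 350; i++) {
         ids.add(i); // e.g. TREC topics 301..350
      }
      System.out.println(sequentialFolds(ids, 10)); // 10 folds of 5 consecutive topics
   }
}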
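
getParameters() documents the grid syntax for free parameters, e.g. "kld.mu=100..2500..100", read as lower..upper..step. The expansion itself happens in ParameterGrid, which is not shown in this file; the sketch below shows one plausible inclusive expansion that matches the javadoc's example (100, 200, ..., 2500). The class and method names are illustrative, not RepIR API.

import java.util.ArrayList;
import java.util.List;

public class RangeExpansionSketch {

   // Expand "100..2500..100" into 100.0, 200.0, ..., 2500.0 (both bounds inclusive).
   static List<Double> expand(String range) {
      String[] parts = range.split("\\.\\.");   // lower, upper, step
      double lower = Double.parseDouble(parts[0]);
      double upper = Double.parseDouble(parts[1]);
      double step = Double.parseDouble(parts[2]);
      List<Double> points = new ArrayList<>();
      for (double v = lower; v <= upper + 1e-9; v += step) {
         points.add(v);
      }
      return points;
   }

   public static void main(String[] args) {
      System.out.println(expand("100..2500..100").size()); // prints 25
   }
}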
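
generatePoints(...) enumerates every combination of one point per parameter by treating the array of point indices as a counter, resetting exhausted positions and carrying into the next parameter. A minimal standalone version of the same counting idea over plain lists is shown below; it counts upward rather than downward, uses illustrative names, and is not the RepIR implementation.

import java.util.ArrayList;
import java.util.List;

public class GridCombinationsSketch {

   // Produce one comma-separated setting string per combination, e.g. for
   // kld.mu in {500, 1000} and prox.span in {5, 10} this yields 4 strings.
   static List<String> combinations(List<String> names, List<List<Double>> points) {
      List<String> result = new ArrayList<>();
      int[] idx = new int[names.size()];               // current point per parameter
      while (true) {
         StringBuilder setting = new StringBuilder();
         for (int i = 0; i < names.size(); i++) {
            if (i > 0) {
               setting.append(",");
            }
            setting.append(names.get(i)).append("=").append(points.get(i).get(idx[i]));
         }
         result.add(setting.toString());
         int i = names.size() - 1;                     // carry like an odometer
         while (i >= 0 && ++idx[i] == points.get(i).size()) {
            idx[i] = 0;
            i--;
         }
         if (i < 0) {
            break;                                     // every combination emitted
         }
      }
      return result;
   }

   public static void main(String[] args) {
      List<String> names = List.of("kld.mu", "prox.span");
      List<List<Double>> points = List.of(List.of(500.0, 1000.0), List.of(5.0, 10.0));
      combinations(names, points).forEach(System.out::println);
   }
}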




