All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.terrier.matching.FeaturedScoringMatching Maven / Gradle / Ivy

The newest version!
package org.terrier.matching;

import gnu.trove.TIntIntHashMap;

import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;

import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.terrier.learning.FeaturedResultSet;
import org.terrier.matching.dsms.DocumentScoreModifier;
import org.terrier.matching.matchops.UnorderedWindowOp;
import org.terrier.matching.models.WeightingModel;
import org.terrier.matching.models.WeightingModelFactory;
import org.terrier.sorting.MultiSort;
import org.terrier.structures.CollectionStatistics;
import org.terrier.structures.Index;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.Files;

public abstract class FeaturedScoringMatching extends FilterMatching {
	
	protected static Logger logger = LoggerFactory.getLogger(FeaturedScoringMatching.class);
	protected Index index;
	protected AbstractScoringMatching[] wModels;
	protected String[] wModelNames;
	protected DocumentScoreModifier[] dsms;
	protected String[] dsmNames;
	protected WeightingModel[] qiFeatures;
	protected String[] qiFeatureNames;
	protected boolean sampleFeature = false;
	
	protected Class scoringMatchingImpl;

	public FeaturedScoringMatching(Index _index, Matching _parent, String[] _featureNames,
			Class _scoringMatchingImpl) throws Exception
	{
		super(_parent);
		this.index = _index;
		this.scoringMatchingImpl = _scoringMatchingImpl;
		loadFeatures(_featureNames);	
	}
	
	public FeaturedScoringMatching(Index _index, Matching _parent,
			Class _scoringMatchingImpl) throws Exception
	{
		this(_index, _parent, getModelNames("fat.featured.scoring.matching.features"), _scoringMatchingImpl);
	}
	
	protected static String[] getModelNames(String property) throws Exception {
		return getModelNames(property, false);
	}
	protected static String[] getModelNames(String property, boolean optional) throws Exception {
		String[] modelNames = 
			ArrayUtils.parseDelimitedString(
					ApplicationSetup.getProperty(property, ""), ";");
		boolean file = modelNames.length == 1 && modelNames[0].equals("FILE");
		if (file)
		{
			String filename = ApplicationSetup.getProperty(property + ".file", null);
			if (filename == null)
				throw new IllegalArgumentException("For "+FatFeaturedScoringMatching.class+", property "+property+"file is not set");
			filename = ApplicationSetup.makeAbsolute(filename, ApplicationSetup.TERRIER_ETC);
			String line = null;
			final BufferedReader br = Files.openFileReader(filename);
			final List models = new ArrayList();
			while((line = br.readLine()) != null)
			{
				//ignore lines starting with comments
				if (line.startsWith("#"))
					continue;
				//remove trailing comments
				line = line.replaceAll("#.+$", "");
				//TREC-445: Empty line in feature definition file causes exception
				if (line.length() == 0) 
					continue;
				models.add(line.trim());
			}
			br.close();
			modelNames = models.toArray(new String[models.size()]);
		}
		//allow empty feature files
		if (! optional && modelNames.length == 0)
		{
			if (file)
				throw new IllegalArgumentException("No features found in file " +
					ApplicationSetup.getProperty(property + ".file", null) + " specified in property"  +
					property + ".file");
			
			
			throw new IllegalArgumentException("No features in property " + property);
		}
		return modelNames;
	}

	public static final Predicate>> getTagPredictate(final String matches) {
		return queryTerm -> queryTerm.getRight().contains(matches);
	}

	protected void loadFeatures(final String[] featureNames) throws Exception { 		
		final int featureCount = featureNames.length;
		
		final List _childrenWmodels = new ArrayList<>();
		final List _childrenWmodelNames = new ArrayList();
		
		final List _childrenQiModels = new ArrayList<>();
		final List _childrenQiNames = new ArrayList();
				
		final List _childrenDsms = new ArrayList<>();
		final List _childrenDsmNames = new ArrayList();
		
		for(int i=0;i 0) {
					String params = dsmName.substring(dsmName.indexOf("(")+1, dsmName.indexOf(")"));
					String[] parameters = params.split("\\s*,\\s*");
					
					dsm = ApplicationSetup.getClass(dsmName.substring(0,dsmName.indexOf("(")))
							.asSubclass(DocumentScoreModifier.class)
							.getConstructor(new Class[]{String[].class})
							.newInstance(new Object[]{parameters});
				}
				else{
					dsm = ApplicationSetup.getClass(dsmName).asSubclass(DocumentScoreModifier.class).newInstance();
				}
				_childrenDsms.add(dsm);
				_childrenDsmNames.add(featureNames[i]);
			}
			else if (featureNames[i].startsWith("QI:"))
			{
				final String qiName = featureNames[i].replaceFirst("QI:", "");
				final WeightingModel wm = WeightingModelFactory.newInstance(qiName);
				_childrenQiModels.add(wm);
				_childrenQiNames.add(featureNames[i]);
			}			
			else if (featureNames[i].startsWith("WMODEL") && featureNames[i].contains(":"))//assume WMODEL: WMODELp: WMODELt:
			{
				Predicate>> filter = null;
				final String[] parts = featureNames[i].split(":", 2);
				final String catchName = parts[0];
				final String wModelName = parts[1];
				if (catchName.startsWith("WMODEL$"))	
				{
					String requiredTag = catchName.split("\\$",2)[1];
					filter = getTagPredictate(requiredTag);
				}
				if (catchName.equals("WMODELp"))
					filter = filterProx;
				if (catchName.equals("WMODELt"))
					filter = filterTerm;
				if (catchName.equals("WMODELpuw"))
					filter = filterUW;
				if (catchName.equals("WMODELp1"))
					filter = filterOW;
				
				WeightingModel wm = WeightingModelFactory.newInstance(wModelName);
				AbstractScoringMatching fsm = scoringMatchingImpl
						.getConstructor(Index.class, Matching.class, WeightingModel.class, Predicate.class)
						.newInstance(null, parent, wm, filter);
				fsm.sort = false;
				_childrenWmodels.add(fsm);
				_childrenWmodelNames.add(featureNames[i]);
			} else {
				throw new IllegalArgumentException("invalid feature definition: " + featureNames[i]);
			}
								
		}		
		dsms = _childrenDsms.toArray(new DocumentScoreModifier[0]);
		dsmNames = _childrenDsmNames.toArray(new String[0]);
		
		qiFeatures = _childrenQiModels.toArray(new WeightingModel[0]);
		qiFeatureNames = _childrenQiNames.toArray(new String[0]);
		
		wModels = _childrenWmodels.toArray(new AbstractScoringMatching[0]);
		wModelNames = _childrenWmodelNames.toArray(new String[0]);
		
	}
	
	protected int applyDSMs(Index localIndex,  String queryNumber, MatchingQueryTerms mqtLocal, int numResults,  int[] inputDocids, short[] inputOccurrences, FeaturedResultSet rtr)
	{
		int featureCount = 0;
		TIntIntHashMap docidMap = new TIntIntHashMap(numResults);
		int position = 0;
		for(int docid : inputDocids)
		{
			docidMap.put(docid, position++);
		}
		for(int fid=0;fid>> filterUW = queryTerm -> queryTerm.getLeft().contains(UnorderedWindowOp.STRING_PREFIX);
	public static final Predicate>> filterOW = queryTerm -> queryTerm.getLeft().matches("^.*#\\d+.*$");
	public static final Predicate>> filterProx = filterUW.or(filterOW);
	public static final Predicate>> filterTerm = filterProx.negate();

	@Override
	public String getInfo() {
		return this.getClass().getSimpleName() +
			"["+ArrayUtils.join(wModelNames, ','+ArrayUtils.join(dsmNames, ','))+"]";
	}

	@Override
	public void setCollectionStatistics(CollectionStatistics cs) {
		throw new UnsupportedOperationException();
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy