All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.browseengine.bobo.facets.impl.CompactMultiValueFacetHandler Maven / Gradle / Ivy

There is a newer version: 3.1.2
Show newest version
/**
 * This software is licensed to you under the Apache License, Version 2.0 (the
 * "Apache License").
 *
 * LinkedIn's contributions are made under the Apache License. If you contribute
 * to the Software, the contributions will be deemed to have been made under the
 * Apache License, unless you expressly indicate otherwise. Please do not make any
 * contributions that would be inconsistent with the Apache License.
 *
 * You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, this software
 * distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
 * License for the specific language governing permissions and limitations for the
 * software governed under the Apache License.
 *
 * © 2012 LinkedIn Corp. All Rights Reserved.  
 */

package com.browseengine.bobo.facets.impl;

import it.unimi.dsi.fastutil.floats.FloatArrayList;
import it.unimi.dsi.fastutil.floats.FloatList;
import it.unimi.dsi.fastutil.ints.IntArrayList;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.log4j.Logger;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.ScoreDoc;

import com.browseengine.bobo.api.BoboIndexReader;
import com.browseengine.bobo.api.BrowseFacet;
import com.browseengine.bobo.api.BrowseSelection;
import com.browseengine.bobo.api.FacetIterator;
import com.browseengine.bobo.api.FacetSpec;
import com.browseengine.bobo.facets.FacetCountCollector;
import com.browseengine.bobo.facets.FacetCountCollectorSource;
import com.browseengine.bobo.facets.FacetHandler;
import com.browseengine.bobo.facets.data.FacetDataCache;
import com.browseengine.bobo.facets.data.TermListFactory;
import com.browseengine.bobo.facets.data.TermStringList;
import com.browseengine.bobo.facets.data.TermValueList;
import com.browseengine.bobo.facets.filter.CompactMultiValueFacetFilter;
import com.browseengine.bobo.facets.filter.EmptyFilter;
import com.browseengine.bobo.facets.filter.RandomAccessAndFilter;
import com.browseengine.bobo.facets.filter.RandomAccessFilter;
import com.browseengine.bobo.facets.filter.RandomAccessNotFilter;
import com.browseengine.bobo.query.scoring.BoboDocScorer;
import com.browseengine.bobo.query.scoring.FacetScoreable;
import com.browseengine.bobo.query.scoring.FacetTermScoringFunctionFactory;
import com.browseengine.bobo.sort.DocComparator;
import com.browseengine.bobo.sort.DocComparatorSource;
import com.browseengine.bobo.util.BigIntArray;
import com.browseengine.bobo.util.BigSegmentedArray;
import com.browseengine.bobo.util.StringArrayComparator;

public class CompactMultiValueFacetHandler extends FacetHandler implements FacetScoreable
{
	private static Logger logger = Logger.getLogger(CompactMultiValueFacetHandler.class);
	
	private static final int MAX_VAL_COUNT = 32;
	private final TermListFactory _termListFactory;
	private final String _indexFieldName;

	public CompactMultiValueFacetHandler(String name,String indexFieldName,TermListFactory termListFactory) {
		super(name);
		_indexFieldName=indexFieldName;
		_termListFactory = termListFactory;
	}
	
	public CompactMultiValueFacetHandler(String name,TermListFactory termListFactory) {
      this(name,name,termListFactory);
    }
	
	public CompactMultiValueFacetHandler(String name,String indexFieldName) {
		this(name,indexFieldName,null);
	}
	
	public CompactMultiValueFacetHandler(String name) {
      this(name,name,null);
    }
	
	@Override
	public DocComparatorSource getDocComparatorSource() {
		return new CompactMultiFacetDocComparatorSource(this);
	}
	
  @Override
  public RandomAccessFilter buildRandomAccessFilter(String value,Properties prop) throws IOException
  {
      return new CompactMultiValueFacetFilter(this, value);
  }

  @Override
  public RandomAccessFilter buildRandomAccessAndFilter(String[] vals,Properties prop) throws IOException
  {
    ArrayList filterList = new ArrayList(vals.length);
    
    for (String val : vals)
    {
      RandomAccessFilter f = buildRandomAccessFilter(val, prop);
      if(f != null) 
      {
        filterList.add(f); 
      }
      else
      {
        return EmptyFilter.getInstance();
      }
    }
    if (filterList.size() == 1) return filterList.get(0);
    return new RandomAccessAndFilter(filterList);
  }
  
  @Override
  public RandomAccessFilter buildRandomAccessOrFilter(String[] vals,Properties prop,boolean isNot) throws IOException
  {
    RandomAccessFilter filter = null;
    
    if(vals.length > 0)
    {
      filter = new CompactMultiValueFacetFilter(this,vals);
    }
    else
    {
      filter = EmptyFilter.getInstance();
    }
    if (isNot)
    {
      filter = new RandomAccessNotFilter(filter);
    }
    return filter;
  }
  
  private static int countBits(int val){
	  int c=0;
	  for (c=0;val>0;c++){
		  val &= val-1;
	  }
	  return c;
  }
  
  @Override
  public int getNumItems(BoboIndexReader reader, int id) {
	FacetDataCache dataCache = getFacetData(reader);
	if (dataCache==null) return 0;
	int encoded=dataCache.orderArray.get(id);
	return countBits(encoded);
  }

  @Override
	public String[] getFieldValues(BoboIndexReader reader,int id) {
	  FacetDataCache dataCache = getFacetData(reader);
	  if (dataCache==null) return new String[0];
		int encoded=dataCache.orderArray.get(id);
		if (encoded==0) {
			return new String[]{""};
		}
		else{
			int count=1;
			ArrayList valList=new ArrayList(MAX_VAL_COUNT);
			
			while(encoded != 0)
			{
				if ((encoded & 0x00000001) != 0x0){
					valList.add(dataCache.valArray.get(count));
				}
				count++;
				encoded >>>= 1;
			}
			return valList.toArray(new String[valList.size()]);
		}
	}

    @Override
	public Object[] getRawFieldValues(BoboIndexReader reader,int id){
    	FacetDataCache dataCache = getFacetData(reader);
    	if (dataCache==null) return new String[0];
    	int encoded=dataCache.orderArray.get(id);
		if (encoded==0) {
			return new Object[0];
		}
		else{
			int count=1;
			ArrayList valList=new ArrayList(MAX_VAL_COUNT);
			
			while(encoded != 0)
			{
				if ((encoded & 0x00000001) != 0x0){
					valList.add(dataCache.valArray.getRawValue(count));
				}
				count++;
				encoded >>>= 1;
			}
			return valList.toArray(new Object[valList.size()]);
		}
	}
    
	@Override
	public FacetCountCollectorSource getFacetCountCollectorSource(final BrowseSelection sel,final FacetSpec ospec) {
		return new FacetCountCollectorSource() {
			
			@Override
			public FacetCountCollector getFacetCountCollector(BoboIndexReader reader,
					int docBase) {
				final FacetDataCache dataCache = CompactMultiValueFacetHandler.this.getFacetData(reader);
				return new CompactMultiValueFacetCountCollector(_name,sel,dataCache,docBase,ospec);
			}
		};
	}

	@Override
	public FacetDataCache load(BoboIndexReader reader) throws IOException {
		int maxDoc = reader.maxDoc();

		BigIntArray order = new BigIntArray(maxDoc);

		TermValueList mterms = _termListFactory == null ? new TermStringList() : _termListFactory.createTermList();
		
		IntArrayList minIDList=new IntArrayList();
	    IntArrayList maxIDList=new IntArrayList();
	    IntArrayList freqList = new IntArrayList();
	    
		TermDocs termDocs = null;
		TermEnum termEnum = null;
		int t = 0; // current term number
		mterms.add(null);
		minIDList.add(-1);
	    maxIDList.add(-1);
	    freqList.add(0);
		t++;
		try {
			termDocs = reader.termDocs();
			termEnum = reader.terms(new Term(_indexFieldName, ""));
			do {
				if (termEnum == null)
					break;
				Term term = termEnum.term();
				if (term == null || !_indexFieldName.equals(term.field()))
					break;

				// store term text
				// we expect that there is at most one term per document
				if (t > MAX_VAL_COUNT) {
					throw new IOException("maximum number of value cannot exceed: "
							+ MAX_VAL_COUNT);
				}
				String val = term.text();
				mterms.add(val);
				int bit = (0x00000001 << (t-1));
				termDocs.seek(termEnum);
				//freqList.add(termEnum.docFreq());  // removed because the df doesn't take into account the num of deletedDocs
				int df = 0;
				int minID=-1;
		        int maxID=-1;
		        if(termDocs.next())
		        {
		          df++;
                  int docid = termDocs.doc();
                  order.add(docid, order.get(docid) | bit);
                  minID = docid;
                  while (termDocs.next())
		          {
                    df++;
                    docid = termDocs.doc();
                    order.add(docid, order.get(docid) | bit);
		          }
				  maxID = docid;
		        }
	            freqList.add(df);
				minIDList.add(minID);
		        maxIDList.add(maxID);
				t++;
			} while (termEnum.next());
		} finally {
			try {
				if (termDocs != null) {
					termDocs.close();
				}
			} finally {
				if (termEnum != null) {
					termEnum.close();
				}
			}
		}
		
		mterms.seal();

		return new FacetDataCache(order,mterms,freqList.toIntArray(),minIDList.toIntArray(),maxIDList.toIntArray(),TermCountSize.large);
	}
	
	private static class CompactMultiFacetDocComparatorSource extends DocComparatorSource{
		private final CompactMultiValueFacetHandler _facetHandler;
        private CompactMultiFacetDocComparatorSource(CompactMultiValueFacetHandler facetHandler){
        	_facetHandler = facetHandler;
        }

		@Override
		public DocComparator getComparator(final IndexReader reader, int docbase)
				throws IOException {
			if (!(reader instanceof BoboIndexReader))
				throw new IllegalStateException("reader must be instance of "+BoboIndexReader.class);
			final FacetDataCache dataCache = _facetHandler.getFacetData((BoboIndexReader)reader);
			return new DocComparator(){
				@Override
				public int compare(ScoreDoc doc1, ScoreDoc doc2) {
					int encoded1=dataCache.orderArray.get(doc1.doc);
					int encoded2=dataCache.orderArray.get(doc2.doc);
					return encoded1-encoded2;
				}

				@Override
				public Comparable value(ScoreDoc doc) {
					return new StringArrayComparator(_facetHandler.getFieldValues((BoboIndexReader)reader,doc.doc));
				}
				
			};
		}	
	}
	
	public BoboDocScorer getDocScorer(BoboIndexReader reader,FacetTermScoringFunctionFactory scoringFunctionFactory,Map boostMap){
		FacetDataCache dataCache = getFacetData(reader);
		float[] boostList = BoboDocScorer.buildBoostList(dataCache.valArray, boostMap);
		return new CompactMultiValueDocScorer(dataCache,scoringFunctionFactory,boostList);
	}
	
	private static final class CompactMultiValueDocScorer extends BoboDocScorer{
		private final FacetDataCache _dataCache;
		CompactMultiValueDocScorer(FacetDataCache dataCache,FacetTermScoringFunctionFactory scoreFunctionFactory,float[] boostList){
			super(scoreFunctionFactory.getFacetTermScoringFunction(dataCache.valArray.size(), dataCache.orderArray.size()),boostList);
			_dataCache = dataCache;
		}
		
		@Override
		public Explanation explain(int doc){
			int encoded=_dataCache.orderArray.get(doc);
			
			int count=1;
			FloatList scoreList = new FloatArrayList(_dataCache.valArray.size());
			ArrayList explList = new ArrayList(scoreList.size());
			while(encoded != 0)
			{
				if ((encoded & 0x00000001) != 0x0){
					int idx = count -1;
					scoreList.add(_function.score(_dataCache.freqs[idx], _boostList[idx]));
					explList.add(_function.explain(_dataCache.freqs[idx], _boostList[idx]));
				}
				count++;
				encoded >>>= 1;
			}
			Explanation topLevel = _function.explain(scoreList.toFloatArray());
			for (Explanation sub : explList){
				topLevel.addDetail(sub);
			}
			return topLevel;
		}
		
		@Override
		public final float score(int docid) {
			_function.clearScores();
			int encoded=_dataCache.orderArray.get(docid);
			
			int count=1;
			
			while(encoded != 0)
			{
				int idx = count -1;
				if ((encoded & 0x00000001) != 0x0){	
					_function.scoreAndCollect(_dataCache.freqs[idx], _boostList[idx]);
				}
				count++;
				encoded >>>= 1;
			}
			return _function.getCurrentScore();
		}
		
	}

	private static final class CompactMultiValueFacetCountCollector extends DefaultFacetCountCollector
	{
	  private final BigSegmentedArray _array;
	  private final int[] _combinationCount = new int[16 * 8];
	  private int _noValCount = 0;
	  private boolean _aggregated = false;
	  
	  
	  CompactMultiValueFacetCountCollector(String name,BrowseSelection sel,
	                                       FacetDataCache dataCache,int docBase,
	                                       FacetSpec ospec)
	                                       {
	    super(name,dataCache,docBase,sel,ospec);
	    _array = _dataCache.orderArray;
	  }
	  

	  @Override
	  public final void collectAll()
	  {
	    _count = BigIntArray.fromArray(_dataCache.freqs);
	    _aggregated = true;
	  }
	  
	  @Override
      public final void collect(int docid)
	  {
	    int encoded = _array.get(docid);
	    if(encoded == 0)
	    {
	      _noValCount++;
	    }
	    else
	    {
	      int offset = 0;
	      while(true)
	      {
	        _combinationCount[(encoded & 0x0F) + offset]++;
	        encoded = (encoded >>> 4);
	        if (encoded == 0) break;
	        offset += 16;
	      }
	    }
      }
	
	  @Override
	  public BrowseFacet getFacet(String value)
	  {
	    if(!_aggregated) aggregateCounts();
	    return super.getFacet(value);  
    }

    @Override
    public int getFacetHitsCount(Object value) 
    {
	    if(!_aggregated) aggregateCounts();
	    return super.getFacetHitsCount(value);  
    }
      @Override
      public BigSegmentedArray getCountDistribution()
      {
        if(!_aggregated) aggregateCounts();
        return _count;
      }
      
      @Override
      public List getFacets()
      {
        if(!_aggregated) aggregateCounts();
        return super.getFacets();
      }
      
      private void aggregateCounts()
      {
        _count.add(0, _noValCount);
        
        for(int i = 1; i < _combinationCount.length; i++)
        {
          int count = _combinationCount[i];
          if(count > 0)
          {
            int offset = (i >> 4) * 4;
            int encoded = (i & 0x0F);
            int index = 1;
            while(encoded != 0)
            {
              if ((encoded & 0x00000001) != 0x0)
              {
                int idx = index+offset;
                _count.add(idx, _count.get(idx) + count);
              }
              index++;
              encoded >>>= 1;
            }
          }
        }
        _aggregated = true;
      }

      @Override
      public FacetIterator iterator() {
          if(!_aggregated) aggregateCounts();
    	  return super.iterator();
      }
	}
}