All downloads are free. The search and download features use the official Maven repository.
Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download or modify it as often as you want.
com.browseengine.bobo.facets.statistics.FacetCountStatisicsGenerator Maven / Gradle / Ivy
/**
* This software is licensed to you under the Apache License, Version 2.0 (the
* "Apache License").
*
* LinkedIn's contributions are made under the Apache License. If you contribute
* to the Software, the contributions will be deemed to have been made under the
* Apache License, unless you expressly indicate otherwise. Please do not make any
* contributions that would be inconsistent with the Apache License.
*
* You may obtain a copy of the Apache License at http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, this software
* distributed under the Apache License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the Apache
* License for the specific language governing permissions and limitations for the
* software governed under the Apache License.
*
* © 2012 LinkedIn Corp. All Rights Reserved.
*/
package com.browseengine.bobo.facets.statistics;
import java.io.File;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import com.browseengine.bobo.api.BoboBrowser;
import com.browseengine.bobo.api.BoboIndexReader;
import com.browseengine.bobo.api.BrowseFacet;
import com.browseengine.bobo.api.BrowseRequest;
import com.browseengine.bobo.api.BrowseResult;
import com.browseengine.bobo.api.FacetAccessible;
import com.browseengine.bobo.api.FacetSpec;
import com.browseengine.bobo.api.FacetSpec.FacetSortSpec;
import com.browseengine.bobo.facets.FacetCountCollector;
import com.browseengine.bobo.util.BigIntArray;
/**
 * Base class for generators that condense a facet hit-count distribution into a
 * {@link FacetCountStatistics} summary.
 *
 * <p>Subclasses supply the actual scoring function through
 * {@link #calculateDistributionScore(int[], int, int, int)}; this class takes care of
 * selecting which counts are examined and of accumulating the totals that are handed
 * to that function.
 */
public abstract class FacetCountStatisicsGenerator
{
  /** Smallest count a sample must reach in order to be collected; defaults to 1. */
  private int _minCount = 1;

  /**
   * @return the minimum count a sample must reach to be collected
   */
  public int getMinCount()
  {
    return _minCount;
  }

  /**
   * @param minCount the minimum count a sample must reach to be collected
   */
  public void setMinCount(int minCount)
  {
    _minCount = minCount;
  }

  /**
   * Computes the distribution score for a set of examined counts.
   *
   * @param distribution the counts that were examined: the full input when no top-n
   *          limit was requested, otherwise the largest counts in descending order
   * @param collectedSampleCount sum of all examined counts that reached the minimum count
   * @param numSamplesCollected number of examined counts that reached the minimum count
   * @param totalSamplesCount number of counts that were examined
   * @return the score stored as the "distribution" of the resulting statistics
   */
  public abstract double calculateDistributionScore(int[] distribution,int collectedSampleCount,int numSamplesCollected,int totalSamplesCount);

  /**
   * Builds statistics for a raw count distribution.
   *
   * <p>When {@code n > 0} only the {@code min(n, distribution.length)} largest counts
   * are examined, in descending order. When {@code n <= 0} every count is examined in
   * its original order. The input array is never modified.
   *
   * @param distribution per-value hit counts; must not be {@code null}
   * @param n maximum number of (largest) counts to examine, or a value {@code <= 0}
   *          to examine all of them
   * @return the statistics computed over the examined counts
   */
  public FacetCountStatistics generateStatistic(int[] distribution,int n)
  {
    int[] samples = distribution;
    int totalSampleCount = distribution.length;
    boolean sortedDescending = false;
    if (n > 0)
    {
      totalSampleCount = Math.min(n, distribution.length);
      // Full sort of a copy; a bounded priority queue would be cheaper for small n.
      int[] ascending = distribution.clone();
      Arrays.sort(ascending);
      // Arrays.sort orders ascending, so the top-n values sit at the END of the
      // array. Copy them out largest-first; the early exit below relies on that.
      samples = new int[totalSampleCount];
      for (int i = 0; i < totalSampleCount; i++)
      {
        samples[i] = ascending[ascending.length - 1 - i];
      }
      sortedDescending = true;
    }

    int collectedSampleCount = 0;
    int numSamplesCollected = 0;
    for (int count : samples)
    {
      if (count >= _minCount)
      {
        collectedSampleCount += count;
        numSamplesCollected++;
      }
      else if (sortedDescending)
      {
        // In descending order every remaining count is below the minimum as well.
        break;
      }
    }

    double distScore = calculateDistributionScore(samples, collectedSampleCount, numSamplesCollected, totalSampleCount);

    FacetCountStatistics stats = new FacetCountStatistics();
    stats.setDistribution(distScore);
    stats.setNumSamplesCollected(numSamplesCollected);
    stats.setCollectedSampleCount(collectedSampleCount);
    stats.setTotalSampleCount(totalSampleCount);
    return stats;
  }

  /**
   * Builds statistics from the count distribution exposed by a facet count collector.
   *
   * @param countHitCollector collector whose count distribution is summarized
   * @param n maximum number of (largest) counts to examine, or a value {@code <= 0}
   *          to examine all of them
   * @return the statistics computed over the examined counts
   * @see #generateStatistic(int[], int)
   */
  public FacetCountStatistics generateStatistic(FacetCountCollector countHitCollector,int n)
  {
    return generateStatistic(BigIntArray.toArray(countHitCollector.getCountDistribution()),n);
  }

  /**
   * Ad-hoc driver: runs a fixed query with several facet specs against an index and
   * prints chi-squared based statistics for every facet count collector returned.
   *
   * @param args optional; {@code args[0]} overrides the default index directory
   * @throws Exception if the index cannot be opened or the query cannot be run
   */
  public static void main(String[] args) throws Exception
  {
    String idxPath = (args != null && args.length > 0) ? args[0] : "/Users/jwang/dataset/facet_idx_2/beef";
    Directory idxDir = FSDirectory.open(new File(idxPath));
    QueryParser qp = new QueryParser(Version.LUCENE_CURRENT,"b",new StandardAnalyzer(Version.LUCENE_CURRENT));
    String q = "pc:yahoo";
    Query query = qp.parse(q);

    BrowseRequest req = new BrowseRequest();
    req.setQuery(query);

    // One shared spec: top 5 values per facet, ordered by hit count.
    FacetSpec fspec = new FacetSpec();
    fspec.setExpandSelection(true);
    fspec.setMaxCount(5);
    fspec.setOrderBy(FacetSortSpec.OrderHitsDesc);
    req.setFacetSpec("ccid", fspec);
    req.setFacetSpec("pcid", fspec);
    req.setFacetSpec("education_id", fspec);
    req.setFacetSpec("geo_region", fspec);
    req.setFacetSpec("geo_country", fspec);
    req.setFacetSpec("industry", fspec);
    req.setFacetSpec("proposal_accepts", fspec);
    req.setFacetSpec("num_endorsers", fspec);
    req.setFacetSpec("group_id", fspec);

    BoboIndexReader reader = BoboIndexReader.getInstance(IndexReader.open(idxDir));
    try
    {
      BoboBrowser browser = new BoboBrowser(reader);
      BrowseResult res = browser.browse(req);
      Map<String,FacetAccessible> facetMap = res.getFacetMap();
      for (FacetAccessible f : facetMap.values())
      {
        if (f instanceof FacetCountCollector)
        {
          System.out.println("====================================");
          FacetCountCollector fc = (FacetCountCollector)f;
          int[] dist = BigIntArray.toArray(fc.getCountDistribution());
          if (dist!=null)
          {
            ChiSquaredFacetCountStatisticsGenerator gen = new ChiSquaredFacetCountStatisticsGenerator();
            gen.setMinCount(0);
            FacetCountStatistics stats = gen.generateStatistic(dist, 0);
            System.out.println("stat for field "+fc.getName()+": "+stats);
            // Score shifted by (collected - 1) and scaled by sqrt(2 * (collected - 1)).
            double collectedMinusOne = (double)(stats.getNumSamplesCollected()-1);
            System.out.println("Centered distribution score: " + (stats.getDistribution()-collectedMinusOne)/Math.sqrt(2.0*collectedMinusOne));
            System.out.println("........................");
            List<BrowseFacet> facetList = fc.getFacets();
            System.out.println(facetList);
            System.out.println("........................");
          }
          System.out.println("====================================");
        }
      }
    }
    finally
    {
      // Release the index reader even when parsing/browsing fails.
      reader.close();
    }
  }
}