org.apache.tika.eval.tokens.TokenCounter Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.eval.tokens;
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.apache.commons.math3.util.FastMath;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.tika.eval.textstats.TokenCountPriorityQueue;
/**
* @deprecated use {@link org.apache.tika.eval.textstats.CompositeTextStatsCalculator}
* with {@link org.apache.tika.eval.textstats.TokenEntropy}, {@link org.apache.tika.eval.textstats.TokenLengths}
* and {@link org.apache.tika.eval.textstats.TopNTokens}.
*/
@Deprecated
public class TokenCounter {
Map> map = new HashMap<>(); //Map>
Map tokenStatistics = new HashMap<>();
private final TokenStatistics NULL_TOKEN_STAT = new TokenStatistics(
0, 0, new TokenIntPair[0], 0.0d, new SummaryStatistics());
private final Analyzer generalAnalyzer;
private int topN = 10;
public TokenCounter(Analyzer generalAnalyzer) throws IOException {
this.generalAnalyzer = generalAnalyzer;
}
public void add(String field, String content) throws IOException {
_add(field, generalAnalyzer, content);
}
private void _add(String field, Analyzer analyzer, String content) throws IOException {
int totalTokens = 0;
TokenStream ts = analyzer.tokenStream(field, content);
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
ts.reset();
Map tokenMap = map.get(field);
if (tokenMap == null) {
tokenMap = new HashMap<>();
map.put(field, tokenMap);
}
while (ts.incrementToken()) {
String token = termAtt.toString();
MutableInt cnt = tokenMap.get(token);
if (cnt == null) {
cnt = new MutableInt(1);
tokenMap.put(token, cnt);
} else {
cnt.increment();
}
totalTokens++;
}
ts.close();
ts.end();
int totalUniqueTokens = tokenMap.size();
double ent = 0.0d;
double p = 0.0d;
double base = 2.0;
TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
SummaryStatistics summaryStatistics = new SummaryStatistics();
for (Map.Entry e : tokenMap.entrySet()) {
String token = e.getKey();
int termFreq = e.getValue().intValue();
p = (double) termFreq / (double) totalTokens;
ent += p * FastMath.log(base, p);
int len = token.codePointCount(0, token.length());
for (int i = 0; i < e.getValue().intValue(); i++) {
summaryStatistics.addValue(len);
}
if (queue.top() == null || queue.size() < topN ||
termFreq >= queue.top().getValue()) {
queue.insertWithOverflow(new TokenIntPair(token, termFreq));
}
}
if (totalTokens > 0) {
ent = (-1.0d / (double)totalTokens) * ent;
}
/* Collections.sort(allTokens);
List topNList = new ArrayList<>(topN);
for (int i = 0; i < topN && i < allTokens.size(); i++) {
topNList.add(allTokens.get(i));
}*/
tokenStatistics.put(field, new TokenStatistics(totalUniqueTokens, totalTokens,
queue.getArray(), ent, summaryStatistics));
}
public TokenStatistics getTokenStatistics(String field) {
TokenStatistics tokenStat = tokenStatistics.get(field);
if (tokenStat == null) {
return NULL_TOKEN_STAT;
}
return tokenStat;
}
public void setTopN(int topN) {
this.topN = topN;
}
public void clear(String field) {
Map tokenMap = map.get(field);
if (tokenMap != null) {
tokenMap.clear();
}
tokenStatistics.put(field, NULL_TOKEN_STAT);
}
public Map getTokens(String field) {
Map ret = map.get(field);
if (ret == null) {
return Collections.emptyMap();
}
return ret;
}
}