![JAR search and dependency download from the Maven repository](/logo.png)
eu.project.ttc.models.index.TermMeasure Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of termsuite-core Show documentation
Show all versions of termsuite-core Show documentation
A Java UIMA-based toolbox for multilingual and efficient terminology extraction and multilingual term alignment
package eu.project.ttc.models.index;
import java.util.Comparator;
import com.google.common.base.Preconditions;
import com.google.common.collect.ComparisonChain;
import eu.project.ttc.models.Term;
import eu.project.ttc.models.TermIndex;
import eu.project.ttc.models.TermOccurrence;
public abstract class TermMeasure {
private static final String MSG_NOT_COMPUTED = "This term measure must be computed first.";
private TermIndex termIndex;
private boolean computed = false;
public TermMeasure(TermIndex termIndex) {
super();
this.termIndex = termIndex;
}
private int totalSpottedTerms = 0;
private double sum = 0d;
private double min = Double.MAX_VALUE;
private double max = Double.MIN_VALUE;
private double avg = Double.NaN;
private double standardDev = Double.NaN;
public abstract double getValue(Term term);
private void checkComputed() {
Preconditions.checkState(computed, MSG_NOT_COMPUTED);
}
public void compute() {
Preconditions.checkState(termIndex.getTerms().size() > 0, "Cannot compute a measure ono an empty TermIndex");
this.sum = 0d;
int num = 0;
for(Term t:termIndex.getTerms()) {
num++;
if(getValue(t) < this.min)
this.min = getValue(t);
if(getValue(t) > this.max)
this.max = getValue(t);
this.sum += getValue(t);
}
/*
* WARNING!
*
* If totalSpottedTerms is not set, we set it as num, but
* aggregated measure will be impacted and no normalization
* will actually work. Normalization should always be operated
* over the same spotting rules, without prior filtering.
*/
if(termIndex.getSpottedTermsNum() > 0)
totalSpottedTerms = termIndex.getSpottedTermsNum();
else
totalSpottedTerms = num;
this.avg = this.sum / this.totalSpottedTerms;
// compute standard deviation
double sigmaSquare = 0;
for(Term term:termIndex.getTerms())
sigmaSquare+=Math.pow(getValue(term) - this.avg, 2);
this.standardDev = Math.sqrt(1.0/this.totalSpottedTerms * sigmaSquare);
this.computed = true;
}
public double getStandardDeviation() {
checkComputed();
return this.standardDev;
}
public double getZScore(Term t) {
checkComputed();
return (getValue(t) - getAvg())/getStandardDeviation();
}
public double getMin() {
checkComputed();
return min;
}
public double getMax() {
checkComputed();
return max;
}
public double getAvg() {
checkComputed();
return avg;
}
public double getSum() {
checkComputed();
return sum;
}
public int getTotalSpottedTerms() {
return totalSpottedTerms;
}
public Comparator getTermComparator(final boolean reverse) {
return new Comparator() {
@Override
public int compare(Term o1, Term o2) {
return ComparisonChain.start()
.compare(reverse ? getValue(o2) : getValue(o1), reverse ? getValue(o1) : getValue(o2))
.compare(o1.getGroupingKey(), o2.getGroupingKey())
.result();
}
};
}
public Comparator super TermOccurrence> getOccurrenceComparator(final boolean reverse) {
return new Comparator() {
@Override
public int compare(TermOccurrence o1, TermOccurrence o2) {
return ComparisonChain.start()
.compare(reverse ? getValue(o2.getTerm()) : getValue(o1.getTerm()), reverse ? getValue(o1.getTerm()) : getValue(o2.getTerm()))
.compare(o1.getSourceDocument().getUrl(), o2.getSourceDocument().getUrl())
.compare(o1.getBegin(), o2.getBegin())
.compare(o2.getEnd(), o1.getEnd())
.result();
}
};
}
@Override
public String toString() {
return String.format("avg: %.2f - sigma: %.2f [min: %.2f, max: %.2f, num: %d]",
(float)getAvg(),
(float)getStandardDeviation(),
(float)getMin(),
(float)getMax(),
getTotalSpottedTerms()
);
}
public boolean isComputed() {
return this.computed;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy