org.metafacture.statistics.AbstractCountProcessor Maven / Gradle / Ivy
/*
* Copyright 2013, 2014 Deutsche Nationalbibliothek
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.metafacture.statistics;
import java.util.Hashtable;
import java.util.Map;
import java.util.regex.Pattern;
import org.metafacture.framework.ObjectReceiver;
import org.metafacture.framework.helpers.DefaultObjectPipe;
import org.metafacture.framework.objects.Triple;
/**
* Base class for operating on count data. The expected inputs are triples
* containing as subject the variable name and as object the count. Marginal
* counts must appear first, joint counts second. Marinal counts must be written
* as 1:VARNAME, Joint counts as 2:FIRSTVARNAME&SECONDVARNAME.
*
* @author Markus Michael Geipel
*/
public abstract class AbstractCountProcessor extends DefaultObjectPipe> {
private static final Pattern KEY_SPLIT_PATTERN = Pattern.compile("&", Pattern.LITERAL);
private static final String MARGINAL_PREFIX = "1:";
private static final String JOINT_PREFIX = "2:";
private final Map marginals = new Hashtable();
private boolean inHeader = true;
private int minCount;
protected final int getTotal() {
return getMarginal("");
}
protected final void setMinCount(final int min) {
minCount = min;
}
@Override
public final void process(final Triple triple) {
if (triple.getSubject().indexOf('&') == -1) {
if (!inHeader) {
throw new IllegalArgumentException(
"Marginal counts and joint count must not be mixed. Marginal counts must appear first, joint counts second");
}
if (!triple.getSubject().startsWith(MARGINAL_PREFIX)) {
throw new IllegalArgumentException("Marginal counts must start with '1:'");
}
final int marginal = Integer.parseInt(triple.getObject());
if (marginal >= minCount) {
marginals.put(triple.getSubject().substring(2), Integer.valueOf(marginal));
}
} else {
inHeader = false;
if (!triple.getSubject().startsWith(JOINT_PREFIX)) {
throw new IllegalArgumentException("Joint counts must start with '2:'");
}
final int nab = Integer.parseInt(triple.getObject());
final String[] keyParts = KEY_SPLIT_PATTERN.split(triple.getSubject().substring(2));
if (nab >= minCount) {
final int na = getMarginal(keyParts[0]);
final int nb = getMarginal(keyParts[1]);
processCount(keyParts[0], keyParts[1], na, nb, nab);
}
}
}
protected abstract void processCount(final String varA, final String varB, final int countA, final int countB,
final int countAandB);
private int getMarginal(final String string) {
final Integer value = marginals.get(string);
if(null==value){
return 0;
}
return value.intValue();
}
@Override
protected final void onResetStream() {
marginals.clear();
inHeader = true;
reset();
}
protected void reset() {
// nothing to do
}
@Override
protected final void onCloseStream() {
onResetStream();
close();
}
protected void close() {
// nothing to do
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy