All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.openminted.uc.socialsciences.ner.util.MyIobEncoder Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2014
 * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
 * Technische Universität Darmstadt
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package eu.openminted.uc.socialsciences.ner.util;

import java.util.Collection;
import java.util.Map;

import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.util.CasUtil;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;

/**
 * Converts a chunk annotations into IOB2-style.
 * When doing the encoding, it is assumed that there is one custom type with two
 * features. Example: Type 'Named Entity' with the features 'value' and
 * 'modifier'. The 'modifier' feature is expected to be used to create sub
 * types. For example, you may have an annotation with value "LOC" and modifier
 * "city", and another annotation with value "LOC" and modifier "country".
 * The encoder can produce either coarse-grained or fine-grained annotations, so
 * in the example either "B-LOC" for both annotations, or "B-LOCcity" and
 * "B-LOCcountry" which can then be treated as different classes in training a
 * model with these annotations.
 */
public class MyIobEncoder {

	private static final Logger logger = LogManager.getLogger(MyIobEncoder.class);

	private Int2ObjectMap iobBeginMap;
	private Int2ObjectMap iobInsideMap;

	/**
	 * Constructor.
	 *
	 * @param aCas
	 *            The CAS.
	 * @param aType
	 *            The Type of annotation that should be encoded in IOB.
	 * @param aValueFeature
	 *            The Feature for the type that carries the annotation value.
	 * @param aModifierFeature
	 *            The Feature for the type that carries some additional info.
	 * @param useSubTypes
	 *            If true, {aModifierFeatue} will be included in the output,
	 *            otherwise it will be ignored.
	 */
	public MyIobEncoder(CAS aCas, Type aType, Feature aValueFeature, Feature aModifierFeature, boolean useSubTypes) {
		// fill map for whole JCas in order to efficiently encode IOB
		iobBeginMap = new Int2ObjectOpenHashMap<>();
		iobInsideMap = new Int2ObjectOpenHashMap<>();

		Map> coveringNeIdx = CasUtil.indexCovering(aCas, aType, aType);
		Map> tokenIdx = CasUtil.indexCovered(aCas, aType,
				CasUtil.getType(aCas, Token.class));

		nextChunk: for (AnnotationFS chunk : CasUtil.select(aCas, aType)) {
			String value = chunk.getStringValue(aValueFeature);
			String modifier = chunk.getStringValue(aModifierFeature);
			logger.debug(String.format("Annotation: '%s' (%d:%d)", chunk.getCoveredText(), chunk.getBegin(),
					chunk.getEnd()));
			logger.debug("Value: " + chunk.getStringValue(aValueFeature));
			logger.debug("Modifier: " + chunk.getStringValue(aModifierFeature));

			if (value == null)
				continue;
			String label = useSubTypes && modifier != null ? value + modifier : value;

			if (coveringNeIdx.containsKey(chunk)) {
				// there are covering annotations, so don't include this one
				continue nextChunk;
			}

			for (AnnotationFS token : tokenIdx.get(chunk)) {
				if (token.getBegin() == chunk.getBegin()) {
					if (iobInsideMap.containsKey(token.getBegin())) {
						continue nextChunk;
					}
					iobBeginMap.put(token.getBegin(), label);
				} else if (token.getBegin() > chunk.getBegin()){
					iobInsideMap.put(token.getBegin(), label);
				}
			}
		}
	}

	/**
	 * Returns the IOB tag for a given token.
	 *
	 * @param token
	 *            a token.
	 * @return the IOB tag.
	 */
	public String encode(Token token) {
		if (iobBeginMap.containsKey(token.getBegin())) {
			return "B-" + iobBeginMap.get(token.getBegin());
		}

		if (iobInsideMap.containsKey(token.getBegin())) {
			return "I-" + iobInsideMap.get(token.getBegin());
		}

		return "O";
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy