// org.dkpro.tc.features.tcu.TargetSurfaceFormContextFeature Maven / Gradle / Ivy
/*******************************************************************************
* Copyright 2018
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.dkpro.tc.features.tcu;
import java.util.Set;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureType;
import org.dkpro.tc.api.type.TextClassificationTarget;
/**
 * Provides the surface text of a target in the local context as a feature value. This feature
 * only works with machine learning classifiers that can deal with string feature values.
 */
@TypeCapability(inputs = { "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class TargetSurfaceFormContextFeature
    extends TcuLookUpTable
{
    /**
     * Controls whether the extracted surface text is lower-cased before it is used as feature
     * value. Defaults to {@code true}.
     */
    public static final String PARAM_LOWER_CASE = "useLowerCase";
    @ConfigurationParameter(name = PARAM_LOWER_CASE, mandatory = true, defaultValue = "true")
    protected boolean useLowerCase;

    /**
     * Specifies which target's text is set as feature value for the current target. If zero is
     * provided, the surface form of the current target is set as feature value. By setting
     * positive or negative indices the local context can be set as feature, i.e. -1 will set the
     * text of the previous target annotation that occurs before the annotation for which the
     * feature extractor is currently executed. Likewise, +1 will set the text of the following
     * target annotation.
     */
    public static final String PARAM_RELATIVE_TARGET_ANNOTATION_INDEX = "targetAnnotation";
    @ConfigurationParameter(name = PARAM_RELATIVE_TARGET_ANNOTATION_INDEX, mandatory = true)
    protected Integer shiftIdx;

    /** Prefix of the generated feature name; a readable form of the shift is appended. */
    public static final String FEATURE_NAME = "context_";

    /** Value used when the requested index is exactly one past the last target. */
    static final String END_OF_SEQUENCE = "EOS";
    /** Value used when the requested index is exactly one before the first target. */
    static final String BEG_OF_SEQUENCE = "BOS";
    /** Value used when the requested index lies further outside the known targets. */
    static final String OUT_OF_BOUNDARY = "OOB";

    /**
     * Extracts the surface text of the target located {@code shiftIdx} positions away from
     * {@code unit} and returns it as a single string-typed feature.
     *
     * @param aView
     *            the view that contains the targets
     * @param unit
     *            the target for which the feature is extracted
     * @return a set holding exactly one feature named {@code context_<shift>}
     * @throws TextClassificationException
     *             if {@code unit} cannot be located in the lookup table
     */
    public Set<Feature> extract(JCas aView, TextClassificationTarget unit)
        throws TextClassificationException
    {
        // NOTE(review): presumably (re)builds the inherited lookup tables (units, unitBegin2Idx)
        // for the current view - confirm in TcuLookUpTable.
        super.extract(aView, unit);

        Integer currentTargetIdx = super.unitBegin2Idx.get(unit.getBegin());
        if (currentTargetIdx == null) {
            // Fail with a descriptive, declared exception instead of an anonymous
            // NullPointerException caused by unboxing below.
            throw new TextClassificationException("No target index found for the target "
                    + "beginning at offset [" + unit.getBegin() + "]");
        }

        int targetIdx = currentTargetIdx + shiftIdx;
        String featureVal = getTargetText(targetIdx);

        // The value is text, hence the feature must be declared as a string feature.
        return new Feature(FEATURE_NAME + toHumanReadable(shiftIdx), featureVal,
                FeatureType.STRING).asSet();
    }

    /**
     * Turns the relative index into a feature-name suffix: {@code minusN} for negative values,
     * {@code plusN} for positive values and {@code current} for zero.
     */
    private String toHumanReadable(Integer shiftIdx)
    {
        if (shiftIdx < 0) {
            return "minus" + Math.abs(shiftIdx);
        }
        if (shiftIdx > 0) {
            return "plus" + shiftIdx;
        }
        return "current";
    }

    /** Lower-cases the token if lower-casing is enabled, otherwise returns it unchanged. */
    private String lowerCase(String token)
    {
        return useLowerCase ? token.toLowerCase() : token;
    }

    /**
     * Returns the (optionally lower-cased) covered text of the target at position {@code idx},
     * or one of the marker values if {@code idx} does not address an existing target.
     */
    private String getTargetText(int idx)
    {
        int numUnits = units.size();

        if (idx == -1) {
            return BEG_OF_SEQUENCE;
        }
        if (idx == numUnits) {
            return END_OF_SEQUENCE;
        }
        if (idx < -1 || idx > numUnits) {
            return OUT_OF_BOUNDARY;
        }
        return lowerCase(units.get(idx).getCoveredText());
    }
}