org.dkpro.tc.features.tcu.TcuLookUpTable Maven / Gradle / Ivy

Go to download
/*******************************************************************************
 * Copyright 2019
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package org.dkpro.tc.features.tcu;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Set;

import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.dkpro.tc.api.exception.TextClassificationException;
import org.dkpro.tc.api.features.Feature;
import org.dkpro.tc.api.features.FeatureExtractor;
import org.dkpro.tc.api.features.FeatureExtractorResource_ImplBase;
import org.dkpro.tc.api.type.JCasId;
import org.dkpro.tc.api.type.TextClassificationSequence;
import org.dkpro.tc.api.type.TextClassificationTarget;

/**
 * Provides speedy access to the text classification targets (TCU) covered by a
 * TextClassificationSequence. Enables faster access to the previous/next TCU. The look-up tables
 * provided here are rebuilt whenever a JCas with a different id than the previous one is seen.
 */
public class TcuLookUpTable
    extends FeatureExtractorResource_ImplBase
    implements FeatureExtractor
{
    /** Id of the JCas the tables were last built for; empty before the first call. */
    private String lastSeenDocumentId = "";

    /** Keyed by target index; an entry marks the target that starts a sequence. */
    protected HashMap<Integer, Boolean> idx2SequenceBegin = new HashMap<>();
    /** Keyed by target index; an entry marks the target that ends a sequence. */
    protected HashMap<Integer, Boolean> idx2SequenceEnd = new HashMap<>();

    /** Maps a target's begin offset to the target itself. */
    protected HashMap<Integer, TextClassificationTarget> begin2target = new HashMap<>();
    /** Maps a target's begin offset to its index in {@link #units}. */
    protected HashMap<Integer, Integer> targetBegin2Idx = new HashMap<>();
    /** Maps a target's end offset to its index in {@link #units}. */
    protected HashMap<Integer, Integer> targetEnd2Idx = new HashMap<>();
    /** All targets of the current JCas, in the order they were selected. */
    protected List<TextClassificationTarget> units = new ArrayList<>();

    /**
     * Rebuilds the look-up tables when {@code aJCas} carries a different id than the JCas seen on
     * the previous call; otherwise leaves the tables untouched.
     *
     * @param aJCas
     *            the JCas whose targets and sequences are indexed
     * @param aTarget
     *            not used by this implementation
     * @return always {@code null}; this method only populates the look-up tables
     * @throws TextClassificationException
     *             declared by the signature; not thrown by this implementation itself
     */
    public Set<Feature> extract(JCas aJCas, TextClassificationTarget aTarget)
        throws TextClassificationException
    {
        if (isTheSameDocument(aJCas)) {
            return null;
        }

        resetTables();
        indexTargets(aJCas);
        markSequenceBoundaries(aJCas);
        return null;
    }

    /** Replaces every look-up table with a fresh, empty instance. */
    private void resetTables()
    {
        begin2target = new HashMap<>();
        targetBegin2Idx = new HashMap<>();
        // Must be reset together with the other tables; otherwise end offsets of a previous
        // document would still resolve to (now meaningless) indices.
        targetEnd2Idx = new HashMap<>();
        idx2SequenceBegin = new HashMap<>();
        idx2SequenceEnd = new HashMap<>();
        units = new ArrayList<>();
    }

    /** Records each target's begin/end offset together with its running index. */
    private void indexTargets(JCas aJCas)
    {
        int idx = 0;
        for (TextClassificationTarget target : JCasUtil.select(aJCas,
                TextClassificationTarget.class)) {
            Integer begin = target.getBegin();
            Integer end = target.getEnd();
            begin2target.put(begin, target);
            targetBegin2Idx.put(begin, idx);
            targetEnd2Idx.put(end, idx);
            units.add(target);
            idx++;
        }
    }

    /** Flags the indices of the targets that share a begin or end offset with a sequence. */
    private void markSequenceBoundaries(JCas aJCas)
    {
        for (TextClassificationSequence sequence : JCasUtil.select(aJCas,
                TextClassificationSequence.class)) {
            Integer idxStartUnit = targetBegin2Idx.get(sequence.getBegin());
            Integer idxEndUnit = targetEnd2Idx.get(sequence.getEnd());
            // A lookup yields null when no target begins/ends exactly at the sequence boundary;
            // HashMap accepts a null key, so such a miss is stored under null.
            idx2SequenceBegin.put(idxStartUnit, true);
            idx2SequenceEnd.put(idxEndUnit, true);
        }
    }

    /**
     * Compares the id of the {@link JCasId} annotation (looked up with
     * {@code JCasUtil.selectSingle}) against the id remembered from the previous call. As a side
     * effect, the remembered id is updated to the current one.
     *
     * @return {@code true} if the id equals the one seen on the previous call
     */
    private boolean isTheSameDocument(JCas aJCas)
    {
        JCasId casId = JCasUtil.selectSingle(aJCas, JCasId.class);
        String currentId = String.valueOf(casId.getId());
        boolean isSame = currentId.equals(lastSeenDocumentId);
        lastSeenDocumentId = currentId;
        return isSame;
    }

}