All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.ctakes.assertion.cr.GoldEntityAndAttributeReader Maven / Gradle / Ivy

There is a newer version: 6.0.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.assertion.cr;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.jdom.Document;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;

import org.apache.ctakes.typesystem.type.constants.CONST;
import org.apache.ctakes.typesystem.type.textsem.EntityMention;
import org.apache.ctakes.typesystem.type.textsem.EventMention;
import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
import org.apache.ctakes.core.util.Mapper;

/**
 * Read named entity annotations from knowtator xml files into the CAS
 * 
 * @author stephen wu
 *
 */
public class GoldEntityAndAttributeReader extends JCasAnnotator_ImplBase {

	// name of the configuration parameter that should contain the path to the knowtator xml files
	public static final String PARAM_INPUTDIR = "InputDirectory";
	// path to knowtator xml files; assigned in initialize() from PARAM_INPUTDIR.
	// Declared static, so the value is shared by every instance of this reader.
	public static String inputDirectory;
	// counter for assigning entity ids; reset to 0 in initialize() and
	// incremented once per annotation created in process()
	public int identifiedAnnotationId;
	// switch for warning/diagnostic messages
	// NOTE(review): confirm which code paths still consult this flag
	private boolean VERBOSE = true;
	
	/**
	 * Reads the {@link #PARAM_INPUTDIR} configuration parameter into
	 * {@link #inputDirectory} and resets the annotation id counter.
	 *
	 * @param aContext the UIMA context supplying the configuration parameter values
	 * @throws ResourceInitializationException if the superclass initialization fails
	 */
	@Override
	public void initialize(UimaContext aContext) throws ResourceInitializationException {
		super.initialize(aContext);

		final Object configuredDirectory = aContext.getConfigParameterValue(PARAM_INPUTDIR);
		inputDirectory = (String) configuredDirectory;

		// ids are handed out sequentially, starting from zero
		identifiedAnnotationId = 0;
	}

	@Override
	public void process(JCas jCas) throws AnalysisEngineProcessException {

			JCas initView;
      try {
        initView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
      } catch (CASException e) {
        throw new AnalysisEngineProcessException(e);
      } 
			String goldFilePath = inputDirectory + DocumentIDAnnotationUtil.getDocumentID(jCas) + ".knowtator.xml";
			
      SAXBuilder builder = new SAXBuilder();
      Document document;
      try {
        document = builder.build(new File(goldFilePath));
      } catch (JDOMException e) {
        throw new AnalysisEngineProcessException(e);
      } catch (Exception e) { // TODO this should be IOException, but the command-line maven build was breaking
        throw new AnalysisEngineProcessException(e);
      }

      // map knowtator mention ids to entity offsets
      HashMap> allMentions = XMLReader.getEntityMentions(document);
      // map knowtator mention ids to entity types
      HashMap entityTypes = XMLReader.getEntityTypes(document);
      // map knowtator mention ids to the ids of mention-attributes (or attributes themselves)
      HashMap> mentionAttr = XMLReader.getEntityAttributes(document);
      // map knowtator mention-attribute ids to attributes
      String[] complexSlotMention = {"complexSlotMention"};
      HashMap attrPtr = XMLReader.getAttributes(document,complexSlotMention);
      // map knowtator attribute ids to role-value pairs
      HashMap attrs = XMLReader.getAttributes(document);


      System.out.println("What's in attrPtr -- the mention-attribute ids to attributes....");
      for (Entry e : attrPtr.entrySet()) {
    	  System.out.println("attrPtr: " + e.getKey() + " with role " + e.getValue().role + " and value " + e.getValue().value);
      }

      System.out.println("\nWhat's in attrs -- the knowtator attribute ids....");
      for (Entry e : attrs.entrySet()) {
    	  System.out.println("attrs: " + e.getKey() + " with role " + e.getValue().role + " and value " + e.getValue().value);
      }
      
      // pare down hashmap based on types -- keep only NEs
      HashMap> neMentions = filterToNamedEntitiesOnly(allMentions,entityTypes);

      for(Map.Entry> mention : neMentions.entrySet()) {
    	  String mentionId = mention.getKey();
    	  
//    	  // pare down what to consider -- keep only valid NEs, discard modifiers
//    	  if (!filterToNamedEntitiesOnly(allMentions,
//    			  mentionId,entityTypes.get(mentionId))) {
//    		  continue;
//    	  }
    	  
    	  Span first = null;
    	  Span last = null;
    	  // for disjoint spans, just ignore the gap
    	  first = mention.getValue().get(0);
    	  last = mention.getValue().get(mention.getValue().size() - 1);

    	  // put entity and attributes into the CAS
    	  // choose either entity or event
    	  IdentifiedAnnotation eMention;
    	  int type = Mapper.getEntityTypeId(entityTypes.get(mentionId));
    	  if (type==CONST.NE_TYPE_ID_ANATOMICAL_SITE) {
    		  eMention = new EntityMention(initView, first.start, last.end);  
    	  } else if (type==CONST.NE_TYPE_ID_DISORDER
    			  || type==CONST.NE_TYPE_ID_DRUG
    			  || type==CONST.NE_TYPE_ID_FINDING
    			  || type==CONST.NE_TYPE_ID_PROCEDURE
    			  || type==CONST.NE_TYPE_ID_ANATOMICAL_SITE
    			  ) {
    		  eMention = new EventMention(initView, first.start, last.end);
    	  } else {
    		  eMention = new IdentifiedAnnotation(initView, first.start, last.end);
    	  }
    	  
    	  // set easy attributes
    	  eMention.setTypeID(Mapper.getEntityTypeId(entityTypes.get(mentionId)));
    	  eMention.setId(identifiedAnnotationId++);
    	  eMention.setDiscoveryTechnique(CONST.NE_DISCOVERY_TECH_GOLD_ANNOTATION);
    	  eMention.setConfidence(1);

    	  if (mentionId.endsWith("4351")) {
    		  System.out.println();
    	  }
    	  
          List assocAttributes = getLeafAttributes(mentionId,
        		  mentionAttr,attrPtr,attrs,new ArrayList());
          
          for (ArgumentInfo a : assocAttributes) {

//    	  // set harder attributes from cas -- look through all attribute ids attached to this mentionId
//    	  for (String attrId : mentionAttr.get(mentionId) ) {
//    		  // make sure this attribute was actually somewhere in the knowtator file
//    		  if (!attrs.containsKey(attrId)) {
//    			  if (VERBOSE) { System.err.println("WARNING: attribute not found: "+attrId); }
//    			  continue;
//    		  }
    			
    		  // look up the attribute id and set values accordingly
    		  checkForAttrValue(eMention, a.role, a.value);
    	  }
    	  
    	  // add to CAS
    	  eMention.addToIndexes();
      }
	}

	private List getLeafAttributes(String id,
			HashMap> mentionAttr,
			HashMap attrPtr, HashMap attrs, List output) {

		// if this is a mention id
		if (mentionAttr.containsKey(id)) {
//			if (mentionAttr.get(id).size()>1 && VERBOSE) {
//				System.err.println("WARNING: expected an attribute's mention to have only one attr, but not so");
//			}
			for (String attrId : mentionAttr.get(id)) {
				// assumes that if you're in an attribute mention, you only have one value
				if (attrPtr.containsKey(attrId)) {
					ArgumentInfo a = attrPtr.get(attrId);
					if ( !isRelationArgument(attrPtr.get(attrId).role) )
						getLeafAttributes(attrPtr.get(attrId).value, mentionAttr, attrPtr, attrs, output);
					
				} else if (attrs.containsKey(attrId)){
					ArgumentInfo a = attrs.get(attrId);
					if ( !isRelationArgument(attrs.get(attrId).role) )
						output.add(attrs.get(attrId));
				}
			}
		} 
		
		// if this is an attribute id
		else if (attrPtr.containsKey(id)) {
			if ( !attrPtr.get(id).role.equals("Related_to_CU") 
					& !attrPtr.get(id).role.equals("Argument_CU") )
				getLeafAttributes(attrPtr.get(id).value, mentionAttr, attrPtr, attrs, output);
		} else if (attrs.containsKey(id)){
			if ( !attrs.get(id).role.equals("Related_to_CU") 
					&& !attrs.get(id).role.equals("Argument_CU") )
				output.add(attrs.get(id));
		}
		
		return output;
	}

	/**
	 * Tells whether a role names one of the two relation arguments.
	 *
	 * @param role the role name, with or without the "_CU" suffix
	 * @return {@code true} if the normalized role is "Related_to" or "Argument"
	 */
	private boolean isRelationArgument(String role) {
		final String normalized = normalizeRoleName(role);
		return normalized.equals("Related_to") || normalized.equals("Argument");
	}

	/**
	 * Applies one Knowtator role/value pair to an annotation.
	 * <p>
	 * Only roles containing "_normalization" are handled; every other role is
	 * ignored. The prefix of the role selects the attribute that is set:
	 * conditional, generic, negation_indicator (polarity), subject or
	 * uncertainty_indicator. A role with the "_normalization" marker but none of
	 * these prefixes leaves the annotation untouched.
	 *
	 * @param eMention the annotation to update
	 * @param role     the Knowtator role name of the attribute
	 * @param value    the Knowtator value of the attribute
	 */
	private void checkForAttrValue(IdentifiedAnnotation eMention, String role,
			String value) {
		if (!role.contains("_normalization")) {
			return;
		}

		if (role.startsWith("conditional")) {
			eMention.setConditional(Boolean.valueOf(value));
		} else if (role.startsWith("generic")) {
			eMention.setGeneric(Boolean.valueOf(value));
		} else if (role.startsWith("negation_indicator")) {
			// assumes that the string from Knowtator is exactly "negation_present";
			// any other value counts as negation absent
			if (value.equals("negation_present")) {
				eMention.setPolarity(CONST.NE_POLARITY_NEGATION_PRESENT);
			} else {
				eMention.setPolarity(CONST.NE_POLARITY_NEGATION_ABSENT);
			}
		} else if (role.startsWith("subject")) {
			// assumes that the strings from Knowtator are exactly what's in the type system
			eMention.setSubject(value);
		} else if (role.startsWith("uncertainty_indicator")) {
			// assumes that the string from Knowtator is exactly "indicator_present";
			// any other value counts as uncertainty absent
			if (value.equals("indicator_present")) {
				eMention.setUncertainty(CONST.NE_UNCERTAINTY_PRESENT);
			} else {
				eMention.setUncertainty(CONST.NE_UNCERTAINTY_ABSENT);
			}
		}
	}

	// Takes the Knowtator schema value and filters out things that are not NE.
	//   In principle can have a parallel "filterToAttributesOnly"
	private HashMap> filterToNamedEntitiesOnly(
			HashMap> entityMentions,
			HashMap entityTypes) {
		HashMap> newEntityMentions = new HashMap>(); 
		
		for (Entry etype : entityTypes.entrySet()) {
			if (etype.getValue().equals("Anatomical_site") 
					|| etype.getValue().equals("Disease_Disorder")					
						|| etype.getValue().equals("Lab")					
							|| etype.getValue().equals("Medications")					
								|| etype.getValue().equals("Procedure")					
									|| etype.getValue().equals("Sign_symptom")					
			) {
				if (entityMentions.containsKey(etype.getKey())) {
					newEntityMentions.put(etype.getKey(),entityMentions.get(etype.getKey()));
				}
			}
		}
		
		return newEntityMentions;
	}
	
	// Takes the Knowtator schema value and filters out things that are not NE.
	//   In principle can have a parallel "filterToAttributesOnly"
	private boolean filterToNamedEntitiesOnly(
			HashMap> entityMentions,
			String typeKey, String typeValue) {

		if (typeValue.toLowerCase().equals("Anatomical_site") 
				|| typeValue.toLowerCase().equals("Disease_Disorder")					
				|| typeValue.toLowerCase().equals("Lab")					
				|| typeValue.toLowerCase().equals("Medications")					
				|| typeValue.toLowerCase().equals("Procedure")					
				|| typeValue.toLowerCase().equals("Sign_symptom")					
		) {
			if (entityMentions.containsKey(typeKey)) {
				return true;
			}
		}

		return false;
	}
		
	/**
	 * Strips the "_CU" suffix from the two relation-argument role names:
	 * "Argument_CU" becomes "Argument" and "Related_to_CU" becomes "Related_to".
	 * Every other role name is returned unchanged.
	 * <p>
	 * This will not be necessary in the future when the data will be
	 * post-processed to remove _CU suffixes. Currently mipacq data does not have
	 * the suffixes and sharp data does.
	 *
	 * @param role the role name as found in the annotation data
	 * @return the role name without the "_CU" suffix for the two known roles,
	 *         otherwise {@code role} itself
	 */
	private static String normalizeRoleName(String role) {
		final boolean isSuffixedArgument = role.equals("Argument_CU");
		if (isSuffixedArgument) {
			return "Argument";
		}
		return role.equals("Related_to_CU") ? "Related_to" : role;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy