org.apache.ctakes.dependency.parser.DependencyNodeWriter Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of ctakes-dependency-parser Show documentation
This wraps the ClearNLP dependency parser and semantic role labeler into a UIMA friendly annotator
There is a newer version: 6.0.0
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.ctakes.dependency.parser;
/*
 * Copyright: (c) 2010   Mayo Foundation for Medical Education and 
 * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
 * triple-shield Mayo logo are trademarks and service marks of MFMER.
 *
 * Except as contained in the copyright notice above, or as used to identify 
 * MFMER as the author of this software, the trade names, trademarks, service
 * marks, or product names of the copyright holder shall not be used in
 * advertising, promotion or otherwise in connection with this software without
 * prior written authorization of the copyright holder.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0 
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and 
 * limitations under the License. 
 */


import org.apache.ctakes.core.pipeline.PipeBitInfo;
import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.log4j.Logger;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;


/**
 * CasConsumer that writes the dependency nodes of the current CAS view to a
 * tab-delimited text file, one node per line with a blank line after each sentence.
 * <p>
 * Configuration parameters read in {@link #initialize()}:
 * <ul>
 * <li>{@code outputDir} - directory the files are written to (created if missing)</li>
 * <li>{@code outputFormat} - used (lower-cased) as the file extension and to select
 * the columns written; see {@code formatNode}</li>
 * </ul>
 *
 * @author Mayo Clinic
 */
@PipeBitInfo(
      name = "Dependency Node Writer",
      description = "Writes information about Dependency Nodes to file.",
      role = PipeBitInfo.Role.WRITER,
      dependencies = { PipeBitInfo.TypeProduct.SENTENCE, PipeBitInfo.TypeProduct.DEPENDENCY_NODE }
)
public class DependencyNodeWriter extends CasConsumer_ImplBase {
   // LOG4J logger based on class name
   private Logger iv_logger = Logger.getLogger( getClass().getName() );

   private String iv_outputDir = null;
   private String iv_outputFormat = null;

   // iv_procCount is used to name the output files sequentially if there
   // is a problem with naming based on source names
   private int iv_procCount = 0;


   /**
    * Read in configuration parameters ("outputDir" and "outputFormat").
    */
   public void initialize() throws ResourceInitializationException {
      iv_outputDir = (String)getConfigParameterValue( "outputDir" );
      iv_outputFormat = (String)getConfigParameterValue( "outputFormat" );
   }


   /**
    * Write a tab-delimited file containing the dependency nodes of the view.
    * The file name is the document ID of the view followed by "." and the
    * lower-cased output format; when no document ID is available the name
    * "doc" + (number of views processed so far) is used instead.
    * <p>
    * For every Sentence, each ConllDependencyNode covered by it with a non-zero
    * id is written on its own line, followed by one empty line per sentence.
    *
    * @param jCas view to write
    * @throws Exception if the output file cannot be created or written
    */
   private void processView( JCas jCas ) throws Exception {
      // Lower-case once per document instead of once per node.
      String format = iv_outputFormat.toLowerCase();

      String docName = DocumentIDAnnotationUtil.getDocumentID( jCas );
      if ( docName == null || docName.equals( DocumentIDAnnotationUtil.NO_DOCUMENT_ID ) ) {
         docName = "doc" + iv_procCount + "." + format;
      } else {
         docName = docName + "." + format;
      }

      try {
         File outputDir = new File( iv_outputDir );
         outputDir.mkdirs();
         File outputFile = new File( iv_outputDir + File.separatorChar + docName );

         // FileWriter creates (or truncates) the file itself, so no createNewFile() is needed.
         BufferedWriter bw = new BufferedWriter( new FileWriter( outputFile ) );
         try {
            AnnotationIndex nodeIndex = jCas.getAnnotationIndex( ConllDependencyNode.type );
            FSIterator sentences = jCas.getAnnotationIndex( Sentence.type ).iterator();

            while ( sentences.hasNext() ) {
               Sentence sentence = (Sentence)sentences.next();

               FSIterator nodeIterator = nodeIndex.subiterator( sentence );
               while ( nodeIterator.hasNext() ) {
                  ConllDependencyNode node = (ConllDependencyNode)nodeIterator.next();
                  // Nodes with id 0 are skipped and never written.
                  if ( node.getId() != 0 ) {
                     bw.write( formatNode( node, format ) );
                  }
               }
               // Blank line terminates the sentence.
               bw.write( "\n" );
            }
            bw.flush();
         } finally {
            // Always release the file handle, even when writing fails part-way.
            bw.close();
         }
      } finally {
         iv_procCount++;
      }

   }


   /**
    * Build the output line (including the trailing newline) for one node.
    * The columns depend on the first of these substrings found in the format:
    * <ul>
    * <li>"min": id, form, head, deprel</li>
    * <li>"mpos": id, form, postag, head, deprel</li>
    * <li>"mlem": id, form, lemma, head, deprel</li>
    * <li>"dep": id, form, lemma, postag, head, deprel</li>
    * <li>anything else: id, form, lemma, cpostag, postag, feats, head, deprel, phead, pdeprel</li>
    * </ul>
    *
    * @param node   node to format
    * @param format lower-cased output format
    * @return tab-delimited line ending in "\n"
    */
   private static String formatNode( ConllDependencyNode node, String format ) {
      String head = idOrUnderscore( node.getHead() );
      if ( format.contains( "min" ) ) {
         return node.getId() + "\t"
                + node.getForm() + "\t"
                + head + "\t"
                + node.getDeprel() + "\n";
      }
      if ( format.contains( "mpos" ) ) {
         return node.getId() + "\t"
                + node.getForm() + "\t"
                + node.getPostag() + "\t"
                + head + "\t"
                + node.getDeprel() + "\n";
      }
      if ( format.contains( "mlem" ) ) {
         return node.getId() + "\t"
                + node.getForm() + "\t"
                + node.getLemma() + "\t"
                + head + "\t"
                + node.getDeprel() + "\n";
      }
      if ( format.contains( "dep" ) ) {
         return node.getId() + "\t"
                + node.getForm() + "\t"
                + node.getLemma() + "\t"
                + node.getPostag() + "\t"
                + head + "\t"
                + node.getDeprel() + "\n";
      }
      // Default: the full ten-column layout.
      return node.getId() + "\t"
             + node.getForm() + "\t"
             + node.getLemma() + "\t"
             + node.getCpostag() + "\t"
             + node.getPostag() + "\t"
             + node.getFeats() + "\t"
             + head + "\t"
             + node.getDeprel() + "\t"
             + idOrUnderscore( node.getPhead() ) + "\t"
             + node.getPdeprel() + "\n";
   }


   /**
    * @param node a head node, possibly null
    * @return the id of the node as text, or "_" when there is no node
    */
   private static String idOrUnderscore( ConllDependencyNode node ) {
      return node == null ? "_" : String.valueOf( node.getId() );
   }


   /**
    * Write the dependency nodes of the current view of the cas to a file.
    *
    * @param cas the CAS whose current view is written
    * @throws ResourceProcessException wrapping any failure while obtaining the view or writing the file
    */
   public void processCas( CAS cas ) throws ResourceProcessException {

      iv_logger.info( "Started" );

      try {

         JCas currentView = cas.getCurrentView().getJCas();
         processView( currentView );

      } catch ( Exception e ) {
         throw new ResourceProcessException( e );
      }

   }

}