// Source listing of org.apache.uima.ducc.sampleapps.DuccCasCM from the uima-ducc-examples artifact.
// Note: a newer version of this artifact (3.0.0) is available.
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.uima.ducc.sampleapps;

/*
 * This sample Cas Multiplier reads compressed CASes from a specified zipfile
 * and returns each as a child CAS. A zipfile may contain zip-compressed XMI
 * format CASes or UIMA compressed binary form 6 format CASes. 
 * 
 * See more information in DUCC Book chapters on sample applications.
 * 
 */

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.uima.UIMAFramework;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasMultiplier_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.AbstractCas;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.impl.Serialization;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.ducc.Workitem;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;
import org.apache.uima.util.XMLInputSource;
import org.apache.uima.util.XMLParser;

/**
 * Sample CAS multiplier that reads the entries of a zip file named by a {@link Workitem}
 * and returns each entry as a child CAS.
 *
 * <p>The format of the entries is decided by the first entry of the zip file: if it is named
 * {@code typesystem.xml}, that entry is parsed as a type system description and the remaining
 * entries are deserialized with {@link Serialization#deserializeCAS}; otherwise every entry is
 * deserialized with {@link XmiCasDeserializer}.
 *
 * <p>Each child CAS carries a {@code DuccDocumentInfo} feature structure recording the input
 * file, the output file and the sequence number of the document within the work item.
 *
 * <p>The input file is closed as soon as its last entry has been consumed, when a new work
 * item arrives, and when the component is destroyed.
 */
public class DuccCasCM extends JCasMultiplier_ImplBase {
  /** Name of the optional leading zip entry that holds the type system of the stored CASes. */
  private static final String TYPESYSTEM_ENTRY = "typesystem.xml";
  /** Size of the scratch buffer used to drain a zip entry. */
  private static final int READ_BUFFER_SIZE = 10000;
  /** Size of the buffer placed between the file and the zip decoder. */
  private static final int FILE_BUFFER_SIZE = 1024 * 100;

  private String inputFileName;
  private String outputFileName;
  private FileInputStream fis;
  private ZipInputStream zis;
  private ZipEntry nextEntry;
  /** True while {@link #nextEntry} has been fetched by hasNext() but not yet consumed by next(). */
  private boolean entryPending;
  private Workitem wi;
  private int docInWI;
  private boolean readingXmiFormat;
  private TypeSystem inputTS;
  private Logger logger;

  /**
   * Reports whether another zip entry is available for {@link #next()}.
   *
   * <p>The zip stream is advanced at most once per consumed entry, so calling this method
   * repeatedly without an intervening {@code next()} does not skip entries. When the end of the
   * zip file is reached the input file is closed.
   *
   * @return {@code true} if an entry is waiting to be consumed, {@code false} if no input file
   *         is open or the open one is exhausted
   * @throws AnalysisEngineProcessException if the zip stream cannot be advanced
   */
  @Override
  public boolean hasNext() throws AnalysisEngineProcessException {
    if (entryPending) {
      return true;
    }
    if (zis == null) {
      // No work item has been opened yet, or the previous one was fully consumed.
      return false;
    }
    try {
      nextEntry = zis.getNextEntry();
    } catch (IOException e) {
      throw new AnalysisEngineProcessException(e);
    }
    if (nextEntry == null) {
      closeInputFile();
      return false;
    }
    entryPending = true;
    return true;
  }

  /**
   * Deserializes the pending zip entry into a new child CAS and tags it with document info.
   *
   * <p>If the CAS cannot be populated it is released before the exception propagates, so a
   * failed entry does not permanently consume a CAS.
   *
   * @return the populated child CAS
   * @throws AnalysisEngineProcessException if no entry is available, if {@code typesystem.xml}
   *         appears anywhere but first, if no CAS entry follows {@code typesystem.xml}, or if
   *         reading or deserializing the entry fails
   */
  @Override
  public AbstractCas next() throws AnalysisEngineProcessException {
    if (!hasNext()) {
      throw new AnalysisEngineProcessException(new RuntimeException(
          "next() called but no entries remain in input zipfile " + inputFileName));
    }
    JCas newcas = getEmptyJCas();
    boolean delivered = false;
    try {
      // The pending entry counts as consumed from here on, even if processing it fails.
      entryPending = false;
      selectInputFormat();
      deserializeCurrentEntry(newcas);
      tagDocument(newcas);
      delivered = true;
      return newcas;
    } finally {
      if (!delivered) {
        newcas.release();
      }
    }
  }

  /**
   * Receives a work item CAS and opens the zip file it names.
   *
   * @param jcas CAS that must contain a {@link Workitem} feature structure
   * @throws AnalysisEngineProcessException if the CAS has no work item or the input file
   *         cannot be opened
   */
  @Override
  public void process(JCas jcas) throws AnalysisEngineProcessException {
    Iterator<?> fsit = jcas.getIndexRepository().getAllIndexedFS(jcas.getCasType(Workitem.type));
    if (!fsit.hasNext()) {
      throw new AnalysisEngineProcessException(new RuntimeException("No workitem FS in CAS"));
    }
    wi = (Workitem) fsit.next();
    logger.log(Level.INFO, "DuccCasCM: inputs "+wi.getInputspec()+" outputs "+wi.getOutputspec());
    try {
      openInputFile(wi);
    } catch (IOException e) {
      throw new AnalysisEngineProcessException(e);
    }
  }

  /**
   * Initializes the component and captures the logger of the given context.
   *
   * @param aContext the UIMA context supplied by the framework
   * @throws ResourceInitializationException if the superclass initialization fails
   */
  @Override
  public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);
    logger = aContext.getLogger();
  }

  /** Closes the input file, if one is still open, before the component is destroyed. */
  @Override
  public void destroy() {
    closeInputFile();
    super.destroy();
  }

  /**
   * Opens the zip file named by the work item and resets the per-work-item state.
   * Any input file left open by a previous work item is closed first.
   *
   * @param wi work item supplying the input and output specifications
   * @throws IOException if the input file cannot be opened
   */
  private void openInputFile(Workitem wi) throws IOException {
    closeInputFile();
    inputFileName = wi.getInputspec();
    outputFileName = wi.getOutputspec();
    fis = new FileInputStream(new File(inputFileName));
    zis = new ZipInputStream(new BufferedInputStream(fis, FILE_BUFFER_SIZE));
    docInWI = 0;
  }

  /**
   * Closes the current input file, if any, and clears the stream state.
   * A failure to close is logged rather than thrown because all wanted data has already
   * been read or abandoned by the time this is called.
   */
  private void closeInputFile() {
    if (zis != null) {
      try {
        // Closing the zip stream also closes the wrapped buffered and file streams.
        zis.close();
      } catch (IOException e) {
        if (logger != null) {
          logger.log(Level.WARNING, "DuccCasCM: failed to close input zipfile " + inputFileName, e);
        }
      }
    }
    zis = null;
    fis = null;
    nextEntry = null;
    entryPending = false;
  }

  /**
   * Inspects the pending entry to decide how entries are deserialized.
   *
   * <p>For the first document of a work item, a {@code typesystem.xml} entry selects the
   * type-system-based deserialization and is consumed, leaving {@link #nextEntry} on the first
   * CAS entry; any other entry selects XMI. For later documents a {@code typesystem.xml} entry
   * is an error.
   *
   * @throws AnalysisEngineProcessException if {@code typesystem.xml} is misplaced, cannot be
   *         loaded, or is not followed by any CAS entry
   */
  private void selectInputFormat() throws AnalysisEngineProcessException {
    boolean isTypesystemEntry = TYPESYSTEM_ENTRY.equals(nextEntry.getName());
    if (0 != docInWI) {
      if (isTypesystemEntry) {
        throw new AnalysisEngineProcessException(new RuntimeException(
            "typesystem.xml entry found in the middle of input zipfile "+inputFileName));
      }
      return;
    }
    if (!isTypesystemEntry) {
      readingXmiFormat = true;
      return;
    }
    getTypesystem();
    readingXmiFormat = false;
    if (nextEntry == null) {
      // Fail with a clear message instead of trying to deserialize an empty entry.
      throw new AnalysisEngineProcessException(new RuntimeException(
          "no CAS entries follow typesystem.xml in input zipfile " + inputFileName));
    }
  }

  /**
   * Reads the current zip entry and deserializes it into the given CAS using the format
   * chosen by {@link #selectInputFormat()}.
   *
   * @param newcas the CAS to populate
   * @throws AnalysisEngineProcessException if reading or deserialization fails
   */
  private void deserializeCurrentEntry(JCas newcas) throws AnalysisEngineProcessException {
    try {
      ByteArrayInputStream bis = new ByteArrayInputStream(readCurrentEntry());
      if (readingXmiFormat) {
        XmiCasDeserializer.deserialize(bis, newcas.getCas());
      } else {
        Serialization.deserializeCAS(newcas.getCas(), bis, inputTS, null);
      }
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }
  }

  /**
   * Records input file, output file and document sequence number in the CAS, reusing an
   * indexed {@code DuccDocumentInfo} if the CAS already has one and creating one otherwise.
   *
   * @param newcas the populated child CAS
   */
  private void tagDocument(JCas newcas) {
    Iterator<?> fsit =
        newcas.getIndexRepository().getAllIndexedFS(newcas.getCasType(DuccDocumentInfo.type));
    DuccDocumentInfo di;
    if (fsit.hasNext()) {
      di = (DuccDocumentInfo) fsit.next();
    } else {
      di = new DuccDocumentInfo(newcas);
      di.addToIndexes();
    }
    di.setInputfile(inputFileName);
    di.setOutputfile(outputFileName);
    di.setDocseq(docInWI++);
  }

  /**
   * Drains the zip entry the stream is currently positioned on.
   *
   * @return the uncompressed bytes of the current entry
   * @throws IOException if the zip stream cannot be read
   */
  private byte[] readCurrentEntry() throws IOException {
    byte[] buff = new byte[READ_BUFFER_SIZE];
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    int bytesread;
    while (-1 != (bytesread = zis.read(buff))) {
      baos.write(buff, 0, bytesread);
    }
    return baos.toByteArray();
  }

  /**
   * Parses the current zip entry as a type system description, stores the resulting type
   * system in {@link #inputTS}, and advances {@link #nextEntry} to the entry that follows.
   *
   * @throws AnalysisEngineProcessException if the entry cannot be read or parsed, or the zip
   *         stream cannot be advanced
   */
  private void getTypesystem() throws AnalysisEngineProcessException {
    try {
      ByteArrayInputStream bis = new ByteArrayInputStream(readCurrentEntry());
      // Get XML parser from framework
      XMLParser xmlParser = UIMAFramework.getXMLParser();
      // Parse type system descriptor
      TypeSystemDescription tsDesc =
          xmlParser.parseTypeSystemDescription(new XMLInputSource((InputStream) bis, null));
      // Use type system description to create CAS and get the type system object
      inputTS = CasCreationUtils.createCas(tsDesc, null, null).getTypeSystem();
      // advance to first input CAS
      nextEntry = zis.getNextEntry();
    } catch (Exception e) {
      throw new AnalysisEngineProcessException(e);
    }
  }
}