/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.uima.ducc.sampleapps;

/*
 * This sample CAS Multiplier uses paragraph boundaries to segment a text file, 
 * or a part of a text file, into multiple documents. A child CAS is created
 * for each document. Paragraphs that cross block boundaries are processed
 * in the block where they started. An error is thrown if a paragraph crosses 
 * two block boundaries.
 * 
 * See the DUCC Book chapters on sample applications for more information.
 * 
 */
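
/*
 * Illustrative example (not from the DUCC documentation): a block containing
 *
 *   Paragraph one, line one.
 *   Paragraph one, line two.
 *
 *   Paragraph two.
 *
 * yields two child CASes, one per paragraph. The empty line (two consecutive
 * newline bytes, "\n\n") is the document separator and is not part of either
 * document's text.
 */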

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import java.util.Iterator;

import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasMultiplier_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.AbstractCas;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.ducc.Workitem;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Logger;

public class DuccTextCM extends JCasMultiplier_ImplBase {
  private byte[] buffer = null;
  private int buffsize;
  private FileInputStream fis;
  private String inputFileName;
  private String outputFileName;
  private String language;
  private String encoding;
  private String nextDoc;
  private int nextDocOffset;
  private int bytelength;
  private int blockindex;
  private boolean newWI;
  private boolean spilled;
  private boolean lastblock;
  private int docInWI;
  private long filesize;
  private Workitem wi;
  private int currentindex;
  private Logger logger;
  private FileChannel fc;

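  // Scan strategy for the next findnextdoc() call:
  //   FIRSTDOC         - start of the file's first block; skip any leading newlines
  //   SEP_IN_LASTBLOCK - resume just after a block spill, at the separator that
  //                      ended the previous document
  //   NORMAL           - resume at the current scan position within this block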
  private enum NextDoc { FIRSTDOC, SEP_IN_LASTBLOCK, NORMAL };
  private NextDoc strategy;
  
  private static final int DEFAULT_BUFFER_SIZE = 20000000;

  @Override
  public boolean hasNext() throws AnalysisEngineProcessException {
    if (spilled) {
      return false;
    }
    try {
      return findnextdoc(strategy);
    } catch (IOException e) {
      throw new AnalysisEngineProcessException(e);
    }
  }

  @Override
  public AbstractCas next() throws AnalysisEngineProcessException {
    JCas newcas = getEmptyJCas();
    newcas.setDocumentText(getNextDocument());
    newcas.setDocumentLanguage(language);
    DuccDocumentInfo di = new DuccDocumentInfo(newcas);
    di.setInputfile(inputFileName);
    di.setOutputfile(outputFileName);
    di.setDocseq(docInWI++);
    di.setByteoffset(wi.getBlockindex() * wi.getBlocksize() + nextDocOffset);
    di.addToIndexes();
    return newcas;
  }

  @Override
  public void process(JCas jcas) throws AnalysisEngineProcessException {
    Iterator<FeatureStructure> fsit = jcas.getIndexRepository().getAllIndexedFS(jcas.getCasType(Workitem.type));
    if (!fsit.hasNext()) {
      throw new AnalysisEngineProcessException(new RuntimeException("No workitem FS in CAS"));
    }
    wi = (Workitem) fsit.next();
    logger.log(Level.INFO, "DuccTextCM: "+wi.getInputspec()+" at block "+wi.getBlockindex()+" length "+wi.getBytelength()+
        " offset "+(long)wi.getBlockindex()*wi.getBlocksize()+" outputs "+wi.getOutputspec());
    try {
      openInputFile(wi);
    } catch (IOException e) {
      throw new AnalysisEngineProcessException(e);
    }

    if (buffer == null) {
      // allocate twice the block size so that a document spilling over the
      // block boundary can be read into the same buffer
      if (wi.getBlocksize() > 0) {
        buffsize = wi.getBlocksize() * 2;
      }
      else {
        buffsize = DEFAULT_BUFFER_SIZE;
      }
      buffer = new byte[buffsize];
    }
    else {
      if (wi.getBytelength() > buffsize) {
        buffsize = wi.getBytelength() * 2;
        buffer = new byte[buffsize];
      }
    }

    spilled = false;
    docInWI = 0;
    strategy = (blockindex == 0) ? NextDoc.FIRSTDOC : NextDoc.NORMAL;
  }


  @Override
  public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);
    logger = aContext.getLogger();
  }


  private void openInputFile(Workitem wi) throws IOException {
    inputFileName = wi.getInputspec();
    outputFileName = wi.getOutputspec();
    bytelength = wi.getBytelength();
    blockindex = wi.getBlockindex();
    lastblock = wi.getLastBlock();
    language = wi.getLanguage();
    fis = new FileInputStream(new File(inputFileName));
    encoding = (null==wi.getEncoding()) ? "UTF-8" : wi.getEncoding();
    fc = fis.getChannel();
    // cast to long before multiplying to avoid int overflow on large files
    long start = (long) wi.getBlockindex() * wi.getBlocksize();
    filesize = fc.size();
    if (start > filesize) {
      throw new IOException("Specified start position beyond end of input file "+inputFileName);
    }
    // skip() may skip fewer bytes than requested; treat that as an error
    if (fis.skip(start) != start) {
      throw new IOException("Failed to skip to position "+start+" in input file "+inputFileName);
    }
    newWI = true;
  }

  private boolean findnextdoc(NextDoc condition) throws IOException {
    int startloc = -1;

    if (newWI) {
      newWI = false;
      int len = fis.read(buffer, 0, bytelength);
      if (len != bytelength) {
        throw new IOException("Read "+len+" bytes, expected "+bytelength);
      }
      currentindex = 0;
    }

    if (condition.equals(NextDoc.SEP_IN_LASTBLOCK)) {
      // separator found at end of last block
      if (10 == buffer[currentindex] && 10 == buffer[currentindex+1]) {
        return false;
      }
      if (10 == buffer[currentindex]) {
        currentindex++; // point at first char in Doc
      }
      startloc = currentindex;

      // find end of next doc
      int endloc = 0;
      while (currentindex < (bytelength-1)) {
        if (10 == buffer[currentindex] && 10 == buffer[currentindex+1]) {
          endloc = currentindex; // exclusive upper bound for copyOfRange below
          break;
        }
        currentindex++;
      }
      if (endloc == 0) {
        throw new RuntimeException("Document larger than "+bytelength+" found in "+inputFileName+" block "+blockindex);
      }
      byte[] docbytes = Arrays.copyOfRange(buffer, startloc, endloc);
      nextDoc = new String(docbytes, encoding);
      nextDocOffset = startloc;
      return true;
    }

    if (condition.equals(NextDoc.FIRSTDOC)) {
      // special handling at beginning of first block:
      // skip any leading EOLs to find start of first doc.
      // only executed once per file
      strategy = NextDoc.NORMAL;
      while (10 == buffer[currentindex]) {
        currentindex++;
        if (currentindex == bytelength) {
          return false; // nothing but newlines in this block
        }
      }
    }

    if (condition.equals(NextDoc.NORMAL)) {
      // currentindex is either pointing at the start of a separator, or,
      // if this is a new block, possibly at the middle of a previous document
      if (!(10 == buffer[currentindex] && 10 == buffer[currentindex+1])) {
        // in the middle of a spilled Doc. Find next separator
        while (currentindex < (bytelength-1)) {
          if (10 == buffer[currentindex] && 10 == buffer[currentindex+1]) {
            break;
          }
          currentindex++;
        }
      }
      if (currentindex == bytelength-1) {
        fis.close();
        return false;
      }
      // now pointing at start of a separator; find start/end of next Doc
      while (10 == buffer[currentindex]) {
        currentindex++;
        if (currentindex == bytelength) {
          if (lastblock) {
            fis.close();
            return false;
          }
          // read next block and continue looking for end of Doc
          int len = fis.read(buffer, bytelength, bytelength);
          if (len <= 0) {
            throw new IOException("Read "+len+" bytes for "+inputFileName+" block "+(blockindex+1));
          }
          fis.close();
          spilled = true;
          bytelength += len;
          return findnextdoc(NextDoc.SEP_IN_LASTBLOCK);
        }
      }
    }

    startloc = currentindex;
    // find end of Doc
    int endloc = 0;
    while (currentindex < (bytelength-1)) {
      if (10 == buffer[currentindex] && 10 == buffer[currentindex+1]) {
        endloc = currentindex; // exclusive upper bound for copyOfRange below
        break;
      }
      currentindex++;
    }

    if (endloc == 0) {
      if (lastblock) {
        // no separator before end of data: Doc runs to end of the last block
        endloc = bytelength;
      }
      else {
        // read next block and continue looking for end of Doc
        int len = fis.read(buffer, bytelength, bytelength);
        if (len <= 0) {
          throw new IOException("Read "+len+" bytes for "+inputFileName+" block "+(blockindex+1));
        }
        fis.close();
        spilled = true;
        bytelength += len;
        endloc = bytelength; // default if no separator is found in the spilled data
        while (currentindex < (bytelength-1)) {
          if (10 == buffer[currentindex] && 10 == buffer[currentindex+1]) {
            endloc = currentindex;
            break;
          }
          currentindex++;
        }
      }
    }
    byte[] docbytes = Arrays.copyOfRange(buffer, startloc, endloc);
    nextDoc = new String(docbytes, encoding);
    nextDocOffset = startloc;
    return true;
  }

  private String getNextDocument() {
    return nextDoc;
  }

}
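
/*
 * A minimal, self-contained sketch (not part of the DUCC sample) of the
 * separator scan used by findnextdoc() above: documents are the byte ranges
 * between "\n\n" separators, and a document with no trailing separator runs
 * to the end of the data. The sample text below is hypothetical.
 */
class DuccTextCMScanSketch {
  public static void main(String[] args) {
    byte[] block = "First doc, line one\nline two\n\nSecond doc\n\nThird doc"
        .getBytes(java.nio.charset.StandardCharsets.UTF_8);
    int i = 0;
    while (i < block.length) {
      while (i < block.length && 10 == block[i]) {
        i++; // skip separator newlines to the start of the next doc
      }
      if (i == block.length) {
        break; // nothing but newlines remained
      }
      int start = i;
      while (i < block.length - 1 && !(10 == block[i] && 10 == block[i + 1])) {
        i++; // advance until a "\n\n" separator or the end of the data
      }
      // end is exclusive: at the separator, or past the last byte of the data
      int end = (i == block.length - 1) ? block.length : i;
      System.out.println("doc: [" + new String(block, start, end - start,
          java.nio.charset.StandardCharsets.UTF_8) + "]");
      i = end;
    }
  }
}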