All downloads are free. Search and download functionality uses the official Maven repository.

edu.cmu.lti.oaqa.cse.tools.FileCollectionReader Maven / Gradle / Ivy

/*
 *  Copyright 2012 Carnegie Mellon University
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package edu.cmu.lti.oaqa.cse.tools;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceConfigurationException;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;

import edu.cmu.lti.oaqa.framework.types.InputElement;

/**
 * File-system collection reader for questions with OAQA-specific
 * initialization.
 *
 * <p>The file named by the {@code questionFile} configuration parameter is
 * opened with the "UTF8" charset and consumed one line per call to
 * {@link #getNext(CAS)}. A line of the form {@code <id><whitespace><text>}
 * is turned into an {@link InputElement} whose sequence id is {@code <id>},
 * whose question is {@code <text>} and whose dataset is the value of the
 * {@code dataset} configuration parameter.
 *
 * <p>{@link #hasNext()} buffers a single look-ahead line, so it may be called
 * any number of times between two {@link #getNext(CAS)} calls without losing
 * input.
 */
public class FileCollectionReader extends CollectionReader_ImplBase {

  /** Name of the configuration parameter holding the question file path. */
  public static final String PARAM_QUESTIONFILE = "questionFile";

  /** Matches {@code <id><whitespace><question text>}; group 1 = id, group 2 = text. */
  public final Pattern pQuestionLine = Pattern
      .compile("^([^\\s]+)\\s+([^\\s].*)$");

  /** Matches a comment line: optional whitespace, '#', then at least one character. */
  private static final Pattern COMMENT_LINE = Pattern.compile("^\\s*#.+");

  /** Value of the "dataset" configuration parameter (may be null if unset). */
  private String dataset;

  /** Reader over the question file; null until {@link #initialize()} succeeds. */
  private BufferedReader reader;

  /** Look-ahead line read by hasNext() and consumed by getNext(); null if none is buffered. */
  private String line;

  /** Number of non-blank, non-comment lines consumed so far. */
  private int count = 0;

  /**
   * Reads the configuration parameters and opens the question file.
   *
   * @throws ResourceInitializationException
   *           if the {@code questionFile} parameter is missing, the file does
   *           not exist, or the file cannot be opened
   */
  @Override
  public void initialize() throws ResourceInitializationException {
    this.dataset = (String) getConfigParameterValue("dataset");

    String questionFile = (String) getConfigParameterValue(PARAM_QUESTIONFILE);
    if (questionFile == null) {
      // Fail with a descriptive error instead of a NullPointerException on trim().
      throw new ResourceInitializationException(new IllegalArgumentException(
          "Missing required configuration parameter: " + PARAM_QUESTIONFILE));
    }
    File file = new File(questionFile.trim());

    getLogger().log(Level.INFO, "Loading questions from "
        + file.getAbsolutePath());

    // if input file does not exist, throw exception
    if (!file.exists()) {
      getLogger().log(Level.SEVERE, "The question file does not exist.");
      throw new ResourceInitializationException(
          ResourceConfigurationException.DIRECTORY_NOT_FOUND,
          new Object[] { PARAM_QUESTIONFILE,
              this.getMetaData().getName(),
              file.getPath() });
    }

    try {
      this.reader = new BufferedReader(
          new InputStreamReader(new FileInputStream(file), "UTF8"));
    } catch (IOException e) {
      // Covers both FileNotFoundException and UnsupportedEncodingException.
      throw new ResourceInitializationException(e);
    }
    // Discard any look-ahead left over from a previously opened file.
    this.line = null;
  }

  /**
   * Reports whether another line is available. The line is buffered rather
   * than consumed, so repeated calls do not skip input.
   *
   * @return {@code true} if a line is buffered or could be read from the file
   * @throws RuntimeException
   *           wrapping the {@link IOException} if reading the file fails
   */
  @Override
  public boolean hasNext() {
    try {
      return bufferNextLine();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Consumes the next line of the question file and, if it is a well-formed
   * question line, adds a corresponding {@link InputElement} to the CAS.
   *
   * <p>Blank lines and comment lines (starting with '#') leave the CAS
   * untouched and do not advance the progress counter. Other lines advance
   * the counter even when they do not match {@link #pQuestionLine}.
   *
   * @param aCAS
   *          the CAS to populate
   * @throws IOException
   *           if reading the question file fails
   * @throws CollectionException
   *           if no more lines are available or the JCas cannot be obtained
   */
  @Override
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    if (!bufferNextLine()) {
      throw new CollectionException(new IllegalStateException(
          "No more lines to read from the question file."));
    }

    // Take ownership of the buffered line so the next hasNext() reads a new one.
    String current = line.trim();
    line = null;

    try {
      JCas jcas = aCAS.getJCas();

      // NOTE(review): for blank/comment lines the CAS is returned without an
      // InputElement; confirm downstream components tolerate that.
      if (current.length() == 0 || COMMENT_LINE.matcher(current).matches()) {
        return;
      }

      Matcher mQuestionLine = pQuestionLine.matcher(current);
      if (mQuestionLine.find()) {
        String questionId = mQuestionLine.group(1);
        String questionText = mQuestionLine.group(2);

        // Annotate as input element
        InputElement next = new InputElement(jcas);
        next.setDataset(dataset);
        next.setQuestion(questionText);
        next.setAnswerPattern("");
        next.setSequenceId(questionId);
        next.addToIndexes();
      }
      count++;
    } catch (CASException e) {
      throw new CollectionException(e);
    }
  }

  /**
   * Closes the question file, if it was opened, and drops any buffered line.
   *
   * @throws IOException
   *           if closing the underlying reader fails
   */
  @Override
  public void close() throws IOException {
    line = null;
    if (reader != null) {
      reader.close();
    }
  }

  /**
   * Reports progress as the number of lines processed so far; the total is
   * given as -1 because the file is not pre-scanned.
   *
   * @return a single-element array with the entity progress
   */
  @Override
  public Progress[] getProgress() {
    return new Progress[] { new ProgressImpl(count, -1, Progress.ENTITIES) };
  }

  /**
   * Ensures a look-ahead line is buffered if the file has more input.
   *
   * @return {@code true} if {@link #line} now holds an unconsumed line
   * @throws IOException
   *           if reading the file fails
   */
  private boolean bufferNextLine() throws IOException {
    if (line == null) {
      line = reader.readLine();
    }
    return line != null;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy