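// Page header left over from the code-browsing site this file was copied from
// (not part of the original source):
// de.datexis.ner.reader.CoNLLDatasetReader - Maven / Gradle / Ivy - Go to download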
package de.datexis.ner.reader;

import de.datexis.common.Resource;
import de.datexis.model.Annotation;
import de.datexis.model.Dataset;
import de.datexis.model.Document;
import de.datexis.model.Token;
import de.datexis.model.tag.BIO2Tag;
import de.datexis.model.tag.BIO2Tag.Label;
import de.datexis.ner.MentionAnnotation;
import de.datexis.preprocess.DocumentFactory;
import de.datexis.reader.DatasetReader;
import org.apache.commons.io.LineIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import static de.datexis.common.WordHelpers.skipSpaceAfter;
import static de.datexis.common.WordHelpers.skipSpaceBefore;

/**
 * Reads a Dataset from CoNLL formatted file.
 * @author sarnold
 */
public class CoNLLDatasetReader implements DatasetReader {

	private static final Logger log = LoggerFactory.getLogger(CoNLLDatasetReader.class);
  
  public static enum Charset { UTF_8, ISO_8859_1 };
  
  private static final String LINE_START = "-DOCSTART-";
  
  protected boolean useFirstSentenceAsTitle = false;
  
  protected Annotation.Source annotationSource = Annotation.Source.GOLD;
  
  protected int tagIndex = -1;
  
  protected String type = null;
  
  protected String name;
  
  /**
   * Use a specific name for the Dataset.
   */
  public CoNLLDatasetReader withName(String name) {
    this.name = name;
    return this;
  }
  
  /**
   * Use a specific column as NER tag.
   * @param tagIndex Use this index, starting from 0. Default: last column.
   */
  public CoNLLDatasetReader withTagIndex(int tagIndex) {
    this.tagIndex = tagIndex;
    return this;
  }
  
  /**
   * Use a copy of every first sentence as Document title.
   */
  public CoNLLDatasetReader withFirstSentenceAsTitle(boolean useFirstSentence) {
    this.useFirstSentenceAsTitle = useFirstSentence;
    return this;
  }
  
  /**
   * @param annotationSource Assign this source to all Annotations read from the file.
   */
  public CoNLLDatasetReader withAnnotationSource(Annotation.Source annotationSource) {
    this.annotationSource = annotationSource;
    return this;
  }
  
  /**
   * @param type Use this type instead of the given ones in the dataset,
   */
  public CoNLLDatasetReader withGenericType(String type) {
    this.type = type;
    return this;
  }
  
  /**
   * Read a Dataset from CoNLL file
   */
  @Override
  public Dataset read(Resource path) throws IOException {
    return read(path, Charset.UTF_8);
  }
  
  /**
   * Read a Dataset from CoNLL file
   */
	public Dataset read(Resource path, Charset charset) throws IOException {
    log.info("Reading Dataset from `{}`...", path.toString());
    Dataset data;
    try(InputStream in = path.getInputStream()) {
      CharsetDecoder cs;
      if(charset.equals(Charset.UTF_8)) cs = StandardCharsets.UTF_8.newDecoder();
      else cs = StandardCharsets.ISO_8859_1.newDecoder();
      BufferedReader br = new BufferedReader(new InputStreamReader(in, cs));
      data = readLines(new LineIterator(br));
    }
    if(this.name != null) data.setName(name);
    else data.setName(path.getFileName().replaceFirst("\\..+$", ""));
    return data;
	}
  
  /**
   * Read a Dataset from CoNLL file (static version with default reader)
   */
  public static Dataset readDataset(Resource path, String name, Charset charset) throws IOException {
    final CoNLLDatasetReader reader = new CoNLLDatasetReader();
    Dataset data = reader.read(path, charset);
    data.setName(name);
    return data;
  }
  
  /**
	 * Create a Document from the given data
	 * @return - Document created from data
	 */
	protected Dataset readLines(Iterator lines) {
    
		Dataset result = new Dataset();
		List tokens = new ArrayList<>();
		Token token = null;
		String type = null;

		int cursor = 0;
    String last = "";
		while(lines.hasNext()) {
			String line = lines.next().trim();
			if(line.startsWith(LINE_START)) {
        // end document
        if(!tokens.isEmpty()) {
          Document document = DocumentFactory.fromTokens(tokens);
          MentionAnnotation.annotateFromTags(document, annotationSource, BIO2Tag.class);
          result.addDocument(document);
        }
        // start new document
        cursor = 0;
				tokens = new ArrayList<>();
        type = null;
        last = "";
      } else if(line.length() == 0) {
        // end sentence
        if(!tokens.isEmpty()) {
          token = new Token("\n", cursor, cursor + 1);
          token.putTag(annotationSource, BIO2Tag.O());
          tokens.add(token);
          cursor = token.getEnd();
          last = token.getText();
        }
        type = null;
      } else if(line.length() > 0) {
        // read token
        token = createTokenFromLine(line, cursor, type);
				if(token != null) {
          if(!skipSpaceAfter.contains(last) && !skipSpaceBefore.contains(token.getText())) {
            token.setBegin(token.getBegin() + 1);
            token.setEnd(token.getEnd() + 1);
          }
					tokens.add(token);
          cursor = token.getEnd();
					type = token.getTag(annotationSource, BIO2Tag.class).getType();
          last = token.getText();
				}
      }
		}
		// end document
    if(!tokens.isEmpty()) {
      Document document = DocumentFactory.fromTokens(tokens);
      MentionAnnotation.annotateFromTags(document, annotationSource, BIO2Tag.class);
      result.addDocument(document);
    }
    
    for(Document doc : result.getDocuments()) {
      if(useFirstSentenceAsTitle) {
        if(doc.countSentences() > 0) {
          doc.setTitle(doc.getSentence(0).getText());
        } else {
          doc.setTitle("");
        }
      }
      doc.setTagAvailable(annotationSource, BIO2Tag.class, true);
    }
    
    log.info(String.format("Finished reading dataset (%,d docs, %,d sentences, %,d tokens, %,d mentions)", 
            result.countDocuments(), result.countSentences(), result.countTokens(), result.countAnnotations()));
    
		return result;
	}
   
	/**
	 * Creates Token from the given line of CoNLL2003 data
	 * @param line - CoNLL2003 data to create Token
	 * @param cursor - character index in the whole document
	 * @return Token created from line
	 */
	protected Token createTokenFromLine(String line, int cursor, String prevType) {
		try {
      String[] csv = line.split("\\s+");
      int pos = tagIndex >= 0 ? tagIndex : csv.length - 1;
			String text = csv[0];
			BIO2Tag tag = createTag(csv[pos], prevType);
			int start = cursor;
			int end = cursor + text.length();
			Token token = new Token(text, start, end);
			token.putTag(annotationSource, tag);
			return token;
		} catch (Exception e) {
			log.warn("could not read line: " + line);
			return null;
		}
	}

  /**
   * Returns the NER Label based on current and previous tags
   */
  protected BIO2Tag createTag(String label, String prevType) {
		String[] parts = label.split("\\-");
		String tag = parts[0];
		String type;
    if(this.type != null) {
      type = this.type;
    } else {
      type = parts.length > 1 ? parts[1] : MentionAnnotation.Type.GENERIC;
    }
    switch(tag) {
      case "O":
        return new BIO2Tag(Label.O, null);
      case "B":
        return new BIO2Tag(Label.B, type);
      case "I":
        if(type.equals(prevType)) return new BIO2Tag(Label.I, type);
        else return new BIO2Tag(Label.B, type);
      default:
        log.warn("reading unknown tag " + label);
        return new BIO2Tag(Label.O, null);
		}
	}
  
}