package edu.jhu.hlt.concrete.ingesters.conll;
import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import edu.jhu.hlt.concrete.AnnotationMetadata;
import edu.jhu.hlt.concrete.Communication;
import edu.jhu.hlt.concrete.Entity;
import edu.jhu.hlt.concrete.EntityMention;
import edu.jhu.hlt.concrete.EntityMentionSet;
import edu.jhu.hlt.concrete.EntitySet;
import edu.jhu.hlt.concrete.Section;
import edu.jhu.hlt.concrete.Sentence;
import edu.jhu.hlt.concrete.SituationMention;
import edu.jhu.hlt.concrete.SituationMentionSet;
import edu.jhu.hlt.concrete.TextSpan;
import edu.jhu.hlt.concrete.Token;
import edu.jhu.hlt.concrete.Tokenization;
import edu.jhu.hlt.concrete.TokenizationKind;
import edu.jhu.hlt.concrete.ingesters.base.IngestException;
import edu.jhu.hlt.concrete.ingesters.base.stream.StreamBasedStreamIngester;
import edu.jhu.hlt.concrete.serialization.CommunicationTarGzSerializer;
import edu.jhu.hlt.concrete.serialization.TarGzCompactCommunicationSerializer;
import edu.jhu.hlt.concrete.util.ProjectConstants;
import edu.jhu.hlt.concrete.util.Timing;
/**
* Expects a CoNLL-2011 formatted file, which is a single document with multiple
* parts (each of which is captured as a {@link Section}).
*
* Adds the following:
* - Tokenization for the words
* - TokenTagging for the POS tags
* - Parse for the constituency parse
* - SituationMentionSet for the SRL labels
* - EntitySet and EntityMentionSet for the coref labels
* - EntityMentionSet for the NER labels (on by default)
* - TokenTagging for NER labels (on by default)
*
* Does not (currently) ingest:
* - word senses
* - speaker or author
*
* See: http://conll.cemantix.org/2011/data.html
*
* Note: This ingester does not attempt to merge the EntityMentionSet produced
* by the coref annotations with the one generated (optionally) by the NER labels.
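*
* A minimal usage sketch (the directory path is hypothetical; the suffix is the
* one suggested by {@link #main(String[])}):
* <pre>{@code
* Conll2011 ingester = new Conll2011(
*     Paths.get("/data/conll-2011"),
*     p -> p.toString().endsWith(".v4_gold_conll"));
* List<Communication> comms = ingester.stream().collect(Collectors.toList());
* }</pre>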
*/
public class Conll2011 implements StreamBasedStreamIngester {
private static final Logger LOGGER = LoggerFactory.getLogger(Conll2011.class);
private static final int kbest = 1;
private static final long timestamp = Timing.currentLocalTime();
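// Matches document headers like "#begin document (bc/cctv/00/cctv_0000); part 000"
// (the document id shown is illustrative)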
private static final Pattern p = Pattern.compile("^#begin document \\((\\S+)\\); part (\\S+)$");
static final AnnotationMetadata META_GENERAL = new AnnotationMetadata("conll-2011", timestamp, kbest);
static final AnnotationMetadata META_COREF = new AnnotationMetadata("conll-2011 coref", timestamp, kbest);
static final AnnotationMetadata META_PARSE = new AnnotationMetadata("conll-2011 parse", timestamp, kbest);
static final AnnotationMetadata META_NER = new AnnotationMetadata("conll-2011 NER", timestamp, kbest);
static final AnnotationMetadata META_POS = new AnnotationMetadata("conll-2011 POS", timestamp, kbest);
static final AnnotationMetadata META_SRL = new AnnotationMetadata("conll-2011 SRL", timestamp, kbest);
static final String SECTION_TYPE = "Passage";
public boolean addNerAsTokenTagging = true;
public boolean addNerAsEntityMentionSet = true;
public boolean includeSingleTokenConstituents = true;
public boolean includeDebugInfo = false;
private final Path ingestPath;
private final Predicate<Path> keep;
public boolean debug = false;
public boolean warnOnEmptyCoref = true;
public Conll2011(Path ingestPath, Predicate<Path> keep) {
this.ingestPath = ingestPath;
this.keep = keep;
}
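/**
* Counts occurrences of the character {@code c} in {@code s},
* e.g. {@code count('a', "banana") == 3}.
*/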
public static int count(char c, String s) {
int count = 0;
for (char sc : s.toCharArray())
if (sc == c)
count++;
return count;
}
/**
* Merges all of the Communication-level lists ({@link SituationMentionSet},
* {@link EntitySet}, {@link EntityMentionSet}) from the second arg into the first.
* Expects a single theory on both sides.
*/
public static void mergeInto(Communication addTo, Communication singleSection) {
if (singleSection.getSectionListSize() != 1)
throw new IllegalArgumentException();
addTo.addToSectionList(singleSection.getSectionList().get(0));
// Merge SituationMentionSet
if (addTo.getSituationMentionSetList().size() != 1)
throw new IllegalArgumentException();
if (singleSection.getSituationMentionSetList().size() != 1)
throw new IllegalArgumentException();
SituationMentionSet toSms = addTo.getSituationMentionSetList().get(0);
SituationMentionSet fromSms = singleSection.getSituationMentionSetList().get(0);
for (SituationMention sm : fromSms.getMentionList())
toSms.addToMentionList(sm);
// NOTE: This will drop any other data in SituationMentionSet, etc.
// Thrift doesn't seem to have any workable mergeFrom(thriftObjA, thriftObjB)
// to support this without dropping some fields.
// Merge EntitySet
if (addTo.getEntitySetListSize() != 1)
throw new IllegalArgumentException();
if (singleSection.getEntitySetListSize() != 1)
throw new IllegalArgumentException();
EntitySet toEs = addTo.getEntitySetList().get(0);
EntitySet fromEs = singleSection.getEntitySetList().get(0);
for (Entity e : fromEs.getEntityList())
toEs.addToEntityList(e);
// Merge EntityMentionSet
if (addTo.getEntityMentionSetListSize() !=
singleSection.getEntityMentionSetListSize()) {
throw new IllegalArgumentException();
}
for (int i = 0; i < addTo.getEntityMentionSetListSize(); i++) {
EntityMentionSet toEms = addTo.getEntityMentionSetList().get(i);
EntityMentionSet fromEms = singleSection.getEntityMentionSetList().get(i);
for (EntityMention em : fromEms.getMentionList())
toEms.addToMentionList(em);
}
}
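/**
* Merges single-Section {@link Communication}s (the parts of one CoNLL document,
* in order) into the first element and returns it; all inputs must share the
* same id. For example (hypothetical names):
* {@code Communication whole = mergeCommunicationsAsSections(Arrays.asList(part0, part1));}
*/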
public static Communication mergeCommunicationsAsSections(List<Communication> c) {
Communication all = c.get(0);
for (int i = 1; i < c.size(); i++) {
Communication cc = c.get(i);
if (!all.getId().equals(cc.getId())) {
throw new IllegalArgumentException("not all ids match, these should be "
+ "sections from the same document and have the same id");
}
mergeInto(all, cc);
}
return all;
}
/**
* Considers separate parts as separate {@link Conll2011Document}s. You need
* to merge them into a single {@link Communication} using {@link Section}s.
* @throws IOException
*/
public Stream<List<Conll2011Document>> preIngest() throws IOException {
if (debug) {
for (Path f : find(this.ingestPath, keep)) {
System.out.println("ingestPath contains: " + f);
}
System.out.println("done listing files in " + this.ingestPath);
}
return find(this.ingestPath, keep).stream()
.filter(this.keep)
.map(this::readDocuments);
}
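/**
* Recursively collects every path under {@code root} accepted by {@code keep},
* e.g. {@code find(Paths.get("/data"), p -> p.toString().endsWith(".conll"))}
* (the directory and suffix shown are illustrative).
*/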
public static List<Path> find(Path root, Predicate<Path> keep) throws IOException {
return find2(root.toFile(), f -> keep.test(f.toPath())).stream().map(File::toPath).collect(Collectors.toList());
}
public static List<File> find2(File root, Predicate<File> keep) {
List<File> all = new ArrayList<>();
findHelper2(root, keep, all);
return all;
}
private static void findHelper2(File root, Predicate<File> keep, List<File> addTo) {
if (keep.test(root))
addTo.add(root);
File[] files = root.listFiles();
if (files == null)
return;
for (File f : files)
findHelper2(f, keep, addTo);
}
private List<Conll2011Document> readDocuments(Path f) {
LOGGER.debug("reading from {}", f.toString());
try {
List<String> lines;
try (Stream<String> l = Files.lines(f, StandardCharsets.UTF_8)) {
lines = l.collect(Collectors.toList());
}
List<Conll2011Document> documents = new ArrayList<>();
for (int i = 0; i < lines.size(); i = readDocument(f, lines, i, documents)) {
if (i < 0)
throw new RuntimeException();
}
if (debug) {
System.out.println("read " + documents.size() + " documents from "+ f);
}
return documents;
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
/**
* Reads one "#begin document ... #end document" block and appends it to addTo.
* @param f is only used for debugInfo, use lines for data
* @param lines all lines of the file being read
* @param start index of the "#begin document" header line
* @param addTo receives the parsed {@link Conll2011Document}
* @return the index of the first line after this document's "#end document"
* marker, or -1 if that marker was never found
*/
private int readDocument(Path f, List<String> lines, int start, List<Conll2011Document> addTo) {
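// Each document block looks like (illustrative sketch):
//   #begin document (bc/cctv/00/cctv_0000); part 000
//   ...one Conll2011Row per token, with a blank line between sentences...
//   #end document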
String header = lines.get(start);
Matcher m = p.matcher(header);
if (!m.matches()) {
LOGGER.warn("prev=" + lines.get(start - 1));
LOGGER.warn("head=" + header);
LOGGER.warn("next=" + lines.get(start + 1));
throw new RuntimeException();
}
// id without a part number
// NOTE: Not using part numbers because the *.parse files don't have it
String id = m.group(1);
// Don't need to get the part info from here, it is in Conll2011Row
//String part = m.group(2);
// Count the number of tokens that appear before the tokens in the document
// about to be read
int sentenceIndex = 0;
for (Conll2011Document d : addTo)
sentenceIndex += d.getSentences().size();
// Because the part number is stripped from the document id, the id is not unique!
// Solution: these per-part documents are all merged later on.
String communicationType = "???";
Conll2011Document doc = new Conll2011Document(this, id, communicationType);
addTo.add(doc);
List<Conll2011Sentence> buf = new ArrayList<>();
for (int i = start + 1; i < lines.size(); i = readSentence(f, lines, i, sentenceIndex++, buf)) {
String line = lines.get(i);
if (line.startsWith("#end document")) {
for (Conll2011Sentence s : buf)
doc.add(s);
return i + 1;
}
}
return -1;
}
/**
* Reads one blank-line-terminated sentence and appends it to addTo.
* @param f is only used for debugInfo, use lines for data
* @param lines all lines of the file being read
* @param start index of the sentence's first token row
* @param sentenceIndex index of this sentence within the whole file
* @param addTo receives the parsed {@link Conll2011Sentence}
* @return the index of the first line after the blank line terminating this
* sentence, or -1 if the end of the file was reached first
*/
private int readSentence(Path f, List<String> lines, int start, int sentenceIndex, List<Conll2011Sentence> addTo) {
Conll2011Sentence s = new Conll2011Sentence(this, sentenceIndex);
addTo.add(s);
for (int i = start; i < lines.size(); i++) {
String line = lines.get(i);
if (line.isEmpty()) {
// record where this sentence came from (debug aid) before returning
if (includeDebugInfo)
s.debugInfo = new Conll2011Sentence.DebugInfo(f, start, start + s.size());
return i + 1;
}
s.add(new Conll2011Row(line));
}
return -1;
}
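/**
* Command-line entry point, e.g. (paths are hypothetical):
* {@code Conll2011 /data/conll-2011 /data/comms.tar.gz .v4_gold_conll}
*/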
public static void main(String[] args) throws Exception {
if (args.length != 3) {
System.err.println("please provide:");
System.err.println("1) an input directory of CoNLL data");
System.err.println("2) an output Concrete tar gz file");
System.err.println("3) suffix for the CoNLL files you're looking for (e.g. \".v4_gold_conll\")");
return;
}
Path input = Paths.get(args[0]);
Path output = Paths.get(args[1]);
String suffix = args[2];
if (Files.exists(output)) {
throw new IllegalArgumentException(
"output must not exist (this tool won't overwrite): " + output.toString());
}
System.out.println("reading from " + input.toString() + " looking for files that end in \"" + suffix + "\"");
// Path.endsWith matches whole path segments rather than string suffixes,
// so test the suffix on the string form of the path.
Conll2011 ingester = new Conll2011(input, x -> x.toString().endsWith(suffix));
Stream<Communication> citer = ingester.stream();
List<Communication> comms = citer.collect(Collectors.toList());
System.out.println("writing " + comms.size() + " Communications to " + output.toString());
CommunicationTarGzSerializer ts = new TarGzCompactCommunicationSerializer();
ts.toTarGz(comms, output);
System.out.println("done");
}
@Override
public String getKind() {
return "document";
}
@Override
public long getTimestamp() {
return Timing.currentLocalTime();
}
@Override
public String getTool() {
return Conll2011.class.getSimpleName();
}
@Override
public String getToolVersion() {
return ProjectConstants.VERSION;
}
@Override
public List<String> getToolNotes() {
return new ArrayList<>();
}
@Override
public Stream<Communication> stream() throws IngestException {
try {
return this.preIngest()
// have Stream<List<Conll2011Document>>
// Convert each conll doc to communication
.map(lcd -> {
List<Communication> comms = new ArrayList<>();
for (Conll2011Document cd : lcd)
comms.add(cd.convertToConcrete());
return comms;
})
// now have Stream<List<Communication>>
// apply mergeCommunicationsAsSections
.map(Conll2011::mergeCommunicationsAsSections)
// build the document text and set TextSpans via projectTokenTextSpansUpwards
.map(Conll2011::projectTokenTextSpansUpwards);
} catch (IOException e) {
throw new IngestException(e);
}
}
/**
* Given a {@link Communication} which has {@link Token}s that have their
* text field set, but not their {@link TextSpan}, build a String for the
* entire document and create {@link TextSpan}s that point into that for
* everything above {@link Token}. Sentences go on their own line, and there
* is an empty line at the end of every section.
*
* NOTE: This method should only be used in cases where there is no original
* text (e.g. CoNLL data which comes word-segmented).
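*
* For example, two single-sentence sections whose tokens are [Hello, world, .]
* and [Bye, .] yield the text "Hello world .\n\nBye .\n\n" (tokens are hypothetical).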
*/
public static Communication projectTokenTextSpansUpwards(Communication c) {
if (c.isSetText())
throw new IllegalArgumentException("text is already set");
Communication cpy = new Communication(c);
StringBuilder sb = new StringBuilder();
for (Section sect : cpy.getSectionList()) {
int sectionStart = sb.length();
for (Sentence sent : sect.getSentenceList()) {
Tokenization tok = sent.getTokenization();
if (!TokenizationKind.TOKEN_LIST.equals(tok.getKind()))
throw new IllegalArgumentException("only token lists are supported");
int sentenceStart = sb.length();
List toks = tok.getTokenList().getTokenList();
for (int i = 0; i < toks.size(); i++) {
if (i > 0)
sb.append(' ');
Token t = toks.get(i);
if (!t.isSetText())
throw new IllegalArgumentException("Token text is not set!");
int start = sb.length();
sb.append(t.getText());
int end = sb.length();
t.setTextSpan(new TextSpan(start, end));
}
int sentenceEnd = sb.length();
if (sent.isSetTextSpan()) {
boolean s = sentenceStart == sent.getTextSpan().getStart();
boolean e = sentenceEnd == sent.getTextSpan().getEnding();
if (!s || !e) {
throw new RuntimeException("incompatible existing Sentence.textSpan!"
+ " existingStart=" + sent.getTextSpan().getStart()
+ " existingEnd=" + sent.getTextSpan().getEnding()
+ " computedStart=" + sentenceStart
+ " computedEnd=" + sentenceEnd);
}
} else {
sent.setTextSpan(new TextSpan(sentenceStart, sentenceEnd));
}
sb.append('\n');
}
int sectionEnd = sb.length();
if (sect.isSetTextSpan()) {
boolean s = sectionStart == sect.getTextSpan().getStart();
boolean e = sectionEnd == sect.getTextSpan().getEnding();
if (!s || !e) {
throw new RuntimeException("incompatible existing Sentence.textSpan!"
+ " existingStart=" + sect.getTextSpan().getStart()
+ " existingEnd=" + sect.getTextSpan().getEnding()
+ " computedStart=" + sectionStart
+ " computedEnd=" + sectionEnd);
}
} else {
sect.setTextSpan(new TextSpan(sectionStart, sectionEnd));
}
sb.append('\n');
}
cpy.setText(sb.toString());
return cpy;
}
}