All downloads are free. Search and download functionality uses the official Maven repository.

edu.stanford.nlp.pipeline.ChunkAnnotationUtils Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

The newest version!
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.AnnotationLookup;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.CoreTokenFactory;
import edu.stanford.nlp.util.*;
import edu.stanford.nlp.util.logging.Redwood;

import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility functions for annotating chunks
 *
 * @author Angel Chang
 */
public class ChunkAnnotationUtils  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels logger = Redwood.channels(ChunkAnnotationUtils.class);
  // Shared factory used by fixChunkTokenBoundaries to build the replacement tokens when an
  // existing token has to be split at a chunk boundary.
  // NOTE(review): the meaning of the `true` constructor flag is not visible in this file --
  // confirm against CoreLabelTokenFactory before changing it.
  private static final CoreLabelTokenFactory tokenFactory = new CoreLabelTokenFactory(true);

  // Not instantiable: this class only exposes static helpers.
  private ChunkAnnotationUtils() {} // static methods

  /**
   * Checks that every sentence's text and tokens agree with the document span it claims to cover.
   *
   * <p>For each sentence, the document text between the sentence's character offsets is compared
   * with the sentence text, and the document tokens between the sentence's token offsets are
   * compared (as rendered by {@code getTokenText}) with the sentence tokens. Every mismatch is
   * reported on the debug channel; checking continues with the remaining sentences.
   *
   * @param docAnnotation The document Annotation to analyze
   * @return true if the offsets match for all sentences, false otherwise
   */
  public static boolean checkOffsets(CoreMap docAnnotation) {
    boolean okay = true;
    String docText = docAnnotation.get(CoreAnnotations.TextAnnotation.class);
    String docId = docAnnotation.get(CoreAnnotations.DocIDAnnotation.class);
    List<CoreLabel> docTokens = docAnnotation.get(CoreAnnotations.TokensAnnotation.class);
    List<CoreMap> sentences = docAnnotation.get(CoreAnnotations.SentencesAnnotation.class);
    for (CoreMap sentence : sentences) {
      String sentText = sentence.get(CoreAnnotations.TextAnnotation.class);
      List<CoreLabel> sentTokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      int sentBeginChar = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      int sentEndChar = sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
      int sentBeginToken = sentence.get(CoreAnnotations.TokenBeginAnnotation.class);
      int sentEndToken = sentence.get(CoreAnnotations.TokenEndAnnotation.class);
      String docTextSpan = docText.substring(sentBeginChar, sentEndChar);
      // The span is only read, so a view of the document token list is enough (no copy needed)
      List<CoreLabel> docTokenSpan = docTokens.subList(sentBeginToken, sentEndToken);
      logger.debug("Checking Document " + docId + " span (" + sentBeginChar + "," + sentEndChar + ") ");
      if (!docTextSpan.equals(sentText)) {
        okay = false;
        logger.debug("WARNING: Document " + docId + " span does not match sentence");
        logger.debug("DocSpanText: " + docTextSpan);
        logger.debug("SentenceText: " + sentText);
      }
      String sentTokenStr = getTokenText(sentTokens, CoreAnnotations.TextAnnotation.class);
      String docTokenStr = getTokenText(docTokenSpan, CoreAnnotations.TextAnnotation.class);
      if (!docTokenStr.equals(sentTokenStr)) {
        okay = false;
        logger.debug("WARNING: Document " + docId + " tokens does not match sentence");
        logger.debug("DocSpanTokens: " + docTokenStr);
        logger.debug("SentenceTokens: " + sentTokenStr);
      }
    }
    return okay;
  }

  /**
   * Fix token offsets of sentences to match those in the document (assumes tokens are shared).
   * Sentence token indices may not match the document token list if certain html elements are ignored.
   *
   * <p>The document token list is scanned once, front to back. For every sentence, the position of
   * its first and last token object in the document list is located and written back as the
   * sentence's TokenBeginAnnotation and TokenEndAnnotation (end is exclusive).
   *
   * @param docAnnotation The document Annotation to analyze
   * @return true if fix was okay (including when there are no sentences), false if a sentence has
   *         no tokens or its tokens cannot be found in the remaining document tokens
   */
  public static boolean fixTokenOffsets(CoreMap docAnnotation) {
    List<CoreLabel> docTokens = docAnnotation.get(CoreAnnotations.TokensAnnotation.class);
    List<CoreMap> sentences = docAnnotation.get(CoreAnnotations.SentencesAnnotation.class);
    if (sentences == null || sentences.isEmpty()) {
      // Nothing to fix
      return true;
    }
    if (docTokens == null || docTokens.isEmpty()) {
      // There are sentences but no document tokens to align them with
      return false;
    }
    int i = 0;
    CoreLabel curDocToken = docTokens.get(0);
    for (CoreMap sentence : sentences) {
      List<CoreLabel> sentTokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      if (sentTokens == null || sentTokens.isEmpty()) {
        return false;
      }
      // Tokens are shared between document and sentence, so identity comparison is intended
      CoreLabel sentTokenFirst = sentTokens.get(0);
      while (curDocToken != sentTokenFirst) {
        i++;
        if (i >= docTokens.size()) { return false; }
        curDocToken = docTokens.get(i);
      }
      int sentTokenBegin = i;
      CoreLabel sentTokenLast = sentTokens.get(sentTokens.size() - 1);
      while (curDocToken != sentTokenLast) {
        i++;
        if (i >= docTokens.size()) { return false; }
        curDocToken = docTokens.get(i);
      }
      int sentTokenEnd = i + 1;
      sentence.set(CoreAnnotations.TokenBeginAnnotation.class, sentTokenBegin);
      sentence.set(CoreAnnotations.TokenEndAnnotation.class, sentTokenEnd);
    }
    return true;
  }


  /**
   * Fills in the annotations missing from {@code dest} with the values held by {@code src}.
   * Keys that {@code dest} already contains are left untouched.
   *
   * @param src CoreMap to read annotations from
   * @param dest CoreMap that receives the annotations it does not have yet
   */
  public static void copyUnsetAnnotations(CoreMap src, CoreMap dest) {
    for (Class key : src.keySet()) {
      if (dest.containsKey(key)) {
        // Already set on the destination: keep its value
        continue;
      }
      dest.set(key, src.get(key));
    }
  }

  /**
   * Given a list of character offsets for chunks, fixes the document tokenization so that token
   * boundaries fall on the chunk boundaries: a token that straddles the start and/or end of a
   * chunk is replaced by two or three new tokens split at the chunk offsets.
   *
   * <p>On success the document's TokensAnnotation is replaced with the rebuilt token list. When
   * false is returned the document annotation is not modified.
   *
   * <p>NOTE(review): the token cursor only moves forward, so the chunk offsets are presumably
   * expected in increasing order -- confirm with callers.
   * <p>NOTE(review): false is returned whenever the token cursor runs past the last token, which
   * also happens when a chunk reaches the final token of the document -- confirm this is intended.
   *
   * @param docAnnotation document annotation providing TextAnnotation and TokensAnnotation
   * @param chunkCharOffsets character offsets of the chunks (source = begin, target = end)
   * @return true if the token list was rebuilt and stored, false if the tokens ran out first
   */
  public static boolean fixChunkTokenBoundaries(CoreMap docAnnotation, List chunkCharOffsets) {
    // First identify any tokens that need to be fixed
    String text = docAnnotation.get(CoreAnnotations.TextAnnotation.class);
    List tokens = docAnnotation.get(CoreAnnotations.TokensAnnotation.class);
    // New token list; untouched tokens are carried over as-is, split tokens are replaced
    List output = new ArrayList<>(tokens.size());
    // i / token form a forward-only cursor over the original tokens
    int i = 0;
    CoreLabel token = tokens.get(i);
    for (IntPair offsets:chunkCharOffsets) {
      assert(token.beginPosition() >= 0);
      assert(token.endPosition() >= 0);
      int offsetBegin = offsets.getSource();
      int offsetEnd = offsets.getTarget();
      // Copy over tokens until we reach the token that contains the chunk start
      // (token begin <= offsetBegin < token end); give up if the tokens run out
      while (offsetBegin < token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)
              || offsetBegin >= token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)) {
        output.add(token);
        i++;
        if (i >= tokens.size()) { return false; }
        token = tokens.get(i);
      }
      // offsetBegin is now >= token begin and < token end
      // go until we find a token that starts after our chunk has ended
      while (offsetEnd > token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
        // Check if chunk includes token
        // (makeToken is called here with the token text, its begin offset and its length)
        if (offsetBegin > token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)) {
          // Chunk starts in the middle of the token
          if (offsetEnd < token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)) {
            // Chunk also ends inside the token: split into before / chunk / after
            output.add(tokenFactory.makeToken(text.substring(token.beginPosition(), offsetBegin),
                    token.beginPosition(), offsetBegin-token.beginPosition()));
            output.add(tokenFactory.makeToken(text.substring(offsetBegin,offsetEnd),
                    offsetBegin, offsetEnd-offsetBegin));
            output.add(tokenFactory.makeToken(text.substring(offsetEnd,token.endPosition()),
                    offsetEnd, token.endPosition()-offsetEnd));
          } else {
            // Chunk continues past the token: split into before / chunk part
            output.add(tokenFactory.makeToken(text.substring(token.beginPosition(), offsetBegin),
                    token.beginPosition(), offsetBegin-token.beginPosition()));
            output.add(tokenFactory.makeToken(text.substring(offsetBegin,token.endPosition()),
                    offsetBegin, token.endPosition()-offsetBegin));
          }
        } else if (offsetEnd < token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)) {
          // Chunk ends in the middle of the token: split into chunk part / after
          output.add(tokenFactory.makeToken(text.substring(token.beginPosition(),offsetEnd),
                  token.beginPosition(), offsetEnd-token.beginPosition()));
          output.add(tokenFactory.makeToken(text.substring(offsetEnd,token.endPosition()), offsetEnd,
                  token.endPosition()-offsetEnd));
        } else {
          // success!  chunk contains token
          output.add(token);
        }
        i++;
        // No token left after the one just handled: report failure without storing the output
        if (i >= tokens.size()) { return false; }
        token = tokens.get(i);
      }
    }
    // Add rest of the tokens
    for (; i < tokens.size(); i++) {
      token = tokens.get(i);
      output.add(token);
    }
    docAnnotation.set(CoreAnnotations.TokensAnnotation.class, output);
    return true;
  }

  /**
   * Create chunk that is merged from chunkIndexStart to chunkIndexEnd (exclusive).
   *
   * <p>The merged chunk spans from the character/token begin offsets of the first chunk to the
   * character/token end offsets of the last chunk. Its text is cut out of {@code origText} and its
   * token list is the concatenation of the token lists of the merged chunks (chunks without a
   * token list contribute nothing). Other annotations of the merged chunks are not carried over.
   *
   * @param chunkList - List of chunks
   * @param origText - Text from which to extract chunk text
   * @param chunkIndexStart - Index of first chunk to merge
   * @param chunkIndexEnd - Index of last chunk to merge (exclusive)
   * @param tokenFactory - factory for creating tokens (if we want to get a merged corelabel instead
   *                       of something random); when null a plain Annotation is created. Note that
   *                       this parameter shadows the class-level factory.
   * @return new merged chunk
   */
  public static CoreMap getMergedChunk(List<? extends CoreMap> chunkList, String origText,
                                       int chunkIndexStart, int chunkIndexEnd, CoreLabelTokenFactory tokenFactory) {
    CoreMap firstChunk = chunkList.get(chunkIndexStart);
    CoreMap lastChunk = chunkList.get(chunkIndexEnd - 1);
    int firstCharOffset = firstChunk.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    int lastCharOffset = lastChunk.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    int firstTokenIndex = firstChunk.get(CoreAnnotations.TokenBeginAnnotation.class);
    int lastTokenIndex = lastChunk.get(CoreAnnotations.TokenEndAnnotation.class);

    String chunkText = origText.substring(firstCharOffset, lastCharOffset);
    CoreMap newChunk;
    if (tokenFactory != null) {
      // makeToken takes (text, begin, length): pass the span length, not the end offset
      newChunk = tokenFactory.makeToken(chunkText, firstCharOffset, lastCharOffset - firstCharOffset);
    } else {
      newChunk = new Annotation(chunkText);
    }

    newChunk.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, firstCharOffset);
    newChunk.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, lastCharOffset);
    newChunk.set(CoreAnnotations.TokenBeginAnnotation.class, firstTokenIndex);
    newChunk.set(CoreAnnotations.TokenEndAnnotation.class, lastTokenIndex);
    // The size is only a capacity hint; never let inconsistent token indices make it negative
    List<CoreLabel> tokens = new ArrayList<>(Math.max(0, lastTokenIndex - firstTokenIndex));
    for (int i = chunkIndexStart; i < chunkIndexEnd; i++) {
      List<CoreLabel> chunkTokens = chunkList.get(i).get(CoreAnnotations.TokensAnnotation.class);
      if (chunkTokens != null) {
        tokens.addAll(chunkTokens);
      }
    }
    newChunk.set(CoreAnnotations.TokensAnnotation.class, tokens);
    // TODO: merge other keys into this new chunk ??

    return newChunk;
  }

  /**
   * Create chunk that is merged from chunkIndexStart to chunkIndexEnd (exclusive).
   *
   * <p>Every annotation of the new chunk is computed by the aggregator registered for its key,
   * applied to the sub-list of chunks being merged. If the result is a CoreLabel, its value and
   * original text are set to its word.
   *
   * @param chunkList - List of chunks
   * @param chunkIndexStart - Index of first chunk to merge
   * @param chunkIndexEnd - Index of last chunk to merge (exclusive)
   * @param aggregators - Aggregators, keyed by the annotation key they produce
   * @param tokenFactory - factory for creating tokens (if we want to get a merged corelabel instead
   *                       of something random); when null a plain Annotation is created
   * @return new merged chunk
   */
  public static CoreMap getMergedChunk(List<? extends CoreMap> chunkList,
                                       int chunkIndexStart, int chunkIndexEnd,
                                       Map<Class, CoreMapAttributeAggregator> aggregators,
                                       CoreLabelTokenFactory tokenFactory) {
    CoreMap newChunk;
    if (tokenFactory != null) {
      newChunk = tokenFactory.makeToken();
    } else {
      newChunk = new Annotation("");
    }
    for (Map.Entry<Class, CoreMapAttributeAggregator> entry : aggregators.entrySet()) {
      // Sanity check (only active with -ea): the merge range must lie within the chunk list
      assert chunkIndexEnd <= chunkList.size();
      Class key = entry.getKey();
      Object value = entry.getValue().aggregate(key, chunkList.subList(chunkIndexStart, chunkIndexEnd));
      newChunk.set(key, value);
    }
    if (newChunk instanceof CoreLabel) {
      // Keep the label's value and original text in sync with the aggregated word
      CoreLabel cl = (CoreLabel) newChunk;
      cl.setValue(cl.word());
      cl.setOriginalText(cl.word());
    }
    return newChunk;
  }

  /**
   * Return chunk offsets (indices into {@code chunkList}) for a character span.
   *
   * <p>The start index is the index of the last chunk, before the first chunk that begins after
   * {@code charStart}, whose begin offset is not greater than {@code charStart} (0 if there is no
   * such chunk). The end index is the index of the first chunk, at or after the start index, whose
   * begin offset is at or after {@code charEnd}, or the list size if there is none.
   *
   * @param chunkList - List of chunks
   * @param charStart - character begin offset
   * @param charEnd - character end offset
   * @return chunk offsets, built with {@code Interval.INTERVAL_OPEN_END}
   */
  public static Interval<Integer> getChunkOffsetsUsingCharOffsets(List<? extends CoreMap> chunkList,
                                       int charStart, int charEnd) {
    int chunkStart = 0;
    int chunkEnd = chunkList.size();
    // Find first chunk with start > charStart
    for (int i = 0; i < chunkList.size(); i++) {
      int start = chunkList.get(i).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      if (start > charStart) {
        break;
      }
      chunkStart = i;
    }
    // Find first chunk with start >= charEnd
    for (int i = chunkStart; i < chunkList.size(); i++) {
      int start = chunkList.get(i).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      if (start >= charEnd) {
        chunkEnd = i;
        break;
      }
    }
    return Interval.toInterval(chunkStart, chunkEnd, Interval.INTERVAL_OPEN_END);
  }


  /**
   * Replaces the chunks in [chunkIndexStart, chunkIndexEnd) with a single merged chunk, in place.
   * The merged chunk is built by {@code getMergedChunk} without a token factory.
   *
   * @param chunkList - List of chunks (modified)
   * @param origText - Text from which to extract chunk text
   * @param chunkIndexStart - Index of first chunk to merge
   * @param chunkIndexEnd - Index of last chunk to merge (exclusive)
   */
  public static void mergeChunks(List chunkList, String origText,
                                 int chunkIndexStart, int chunkIndexEnd) {
    // Build the replacement first, while the original chunks are still in the list
    CoreMap merged = getMergedChunk(chunkList, origText, chunkIndexStart, chunkIndexEnd, null);
    // Drop all but one of the merged chunks; the survivor's slot receives the merged chunk
    int remaining = chunkIndexEnd - chunkIndexStart - 1;
    while (remaining > 0) {
      chunkList.remove(chunkIndexStart);
      remaining--;
    }
    chunkList.set(chunkIndexStart, merged);
  }

  /**
   * Returns the first character of the sentence text that is not whitespace,
   * or null if the text is empty or whitespace only.
   */
  private static Character getFirstNonWsChar(CoreMap sent) {
    String content = sent.get(CoreAnnotations.TextAnnotation.class);
    for (char ch : content.toCharArray()) {
      if (Character.isWhitespace(ch)) {
        continue;
      }
      return ch;
    }
    return null;
  }

  /**
   * Returns the offset of the first non-whitespace character of the sentence text, or null if
   * there is none. With {@code relative} the offset is an index into the sentence text; otherwise
   * the sentence's CharacterOffsetBeginAnnotation is added to it.
   */
  private static Integer getFirstNonWsCharOffset(CoreMap sent, boolean relative) {
    String content = sent.get(CoreAnnotations.TextAnnotation.class);
    int pos = 0;
    while (pos < content.length()) {
      if (!Character.isWhitespace(content.charAt(pos))) {
        // The sentence begin offset is only looked up when it is actually needed
        return relative
                ? pos
                : pos + sent.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
      }
      pos++;
    }
    return null;
  }

  /** Returns the sentence text with leading and trailing whitespace removed (via {@code trim}). */
  private static String getTrimmedText(CoreMap sent) {
    return sent.get(CoreAnnotations.TextAnnotation.class).trim();
  }

  /**
   * Given a list of character offsets for chunks, fixes the sentence splitting
   * so that sentences do not break the chunks. Equivalent to the five-argument overload
   * with all three optional behaviours switched off.
   *
   * @param docAnnotation Document with sentences
   * @param chunkCharOffsets ordered pairs of different chunks that should appear in sentences
   * @return true if fix was okay (chunks are in all sentences), false otherwise
   */
  public static boolean fixChunkSentenceBoundaries(CoreMap docAnnotation, List chunkCharOffsets) {
    final boolean offsetsAreNotSorted = false;
    final boolean extendedFixSentence = false;
    final boolean moreExtendedFixSentence = false;
    return fixChunkSentenceBoundaries(docAnnotation, chunkCharOffsets,
            offsetsAreNotSorted, extendedFixSentence, moreExtendedFixSentence);
  }

  /**
   * Given a list of character offsets for chunks, fixes the sentence splitting
   * so that sentences do not break the chunks.
   *
   * <p>For every chunk, the sentence containing the chunk start is located; if the chunk ends in a
   * later sentence, the sentences in between are merged (in place, in the document's sentence
   * list) into one. Two optional heuristics can merge further sentences. At the end the
   * SentenceIndexAnnotation of every remaining sentence is renumbered.
   *
   * <p>When false is returned, merges made for earlier chunks are kept and the sentence indices
   * are not renumbered.
   *
   * @param docAnnotation Document with sentences
   * @param chunkCharOffsets ordered pairs of different chunks that should appear in sentences
   *                         (source = begin offset, target = end offset); may be null
   * @param offsetsAreNotSorted Treat each pair of offsets as independent (look through all sentences again)
   * @param extendedFixSentence Do extended sentence fixing based on some heuristics: if a chunk is
   *                            the last non-whitespace content of its sentence and the next sentence
   *                            does not start with an uppercase character, merge the two
   * @param moreExtendedFixSentence Do even more extended sentence fixing based on some heuristics:
   *                            merge neighbouring sentences when one of them has at most one
   *                            non-blank character, or the second starts with a comma or a
   *                            lowercase character
   * @return true if fix was okay (chunks are in all sentences, or there are no sentences),
   *         false otherwise
   */
  public static boolean fixChunkSentenceBoundaries(CoreMap docAnnotation, List chunkCharOffsets,
                                                   boolean offsetsAreNotSorted,
                                                   boolean extendedFixSentence, boolean moreExtendedFixSentence) {
    String text = docAnnotation.get(CoreAnnotations.TextAnnotation.class);
    List sentences = docAnnotation.get(CoreAnnotations.SentencesAnnotation.class);
    // Nothing to fix without sentences
    if (sentences == null || sentences.size() == 0) return true;
    if (chunkCharOffsets != null) {
      // i / sentence form a cursor over the sentence list that is kept between chunks
      int i = 0;
      CoreMap sentence = sentences.get(i);
      for (IntPair offsets:chunkCharOffsets) {
        int offsetBegin = offsets.getSource();
        int offsetEnd = offsets.getTarget();
        // Advance to the sentence that contains the chunk start
        // (sentence begin <= offsetBegin < sentence end); fail if there is none
        while (offsetBegin < sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class)
                || offsetBegin >= sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)) {
          i++;
          if (i >= sentences.size()) { return false; }
          sentence = sentences.get(i);
        }
        // offsetBegin is now >= sentence begin and < sentence end
        // Check if sentence end includes chunk
        if (sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class) >= offsetEnd) {
          // success!  sentence contains chunk
        } else {
          // hmm, sentence contains beginning of chunk, but not end
          // Lets find sentence that contains end of chunk and merge sentences
          int startSentIndex = i;
          while (offsetEnd > sentence.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)) {
            i++;
            if (i >= sentences.size()) { return false; }
            sentence = sentences.get(i);
          }
          // Absolute offset of the first non-whitespace character of that sentence (null if none)
          Integer firstNonWsCharOffset = getFirstNonWsCharOffset(sentence, false);
          if (firstNonWsCharOffset != null && firstNonWsCharOffset >= offsetEnd) {
            // Ends before first real character of this sentence, don't include this sentence
            i--;
            sentence = sentences.get(i);
          }
          // Okay, now let's merge sentences from startSentIndex to i (includes i)
          mergeChunks(sentences, text, startSentIndex, i+1);
          // Reset our iterating index i to startSentIndex
          i = startSentIndex;
          sentence = sentences.get(i);
        }
        if (extendedFixSentence) {
          //log.info("Doing extended fixing of sentence:" + text.substring(offsetBegin,offsetEnd));
          if (i+1 < sentences.size()) {
            // Extended sentence fixing:
            // Check if entity is at the end of this sentence and if next sentence starts with uppercase
            // If not uppercase, merge with next sentence
            boolean entityAtSentEnd = true;
            int sentCharBegin = sentence.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
            String sentText = sentence.get(CoreAnnotations.TextAnnotation.class);
            // Chunk end expressed as an index into the sentence text
            int offsetEndInSentText = offsetEnd - sentCharBegin;
            // Anything but whitespace after the chunk means the chunk is not at the sentence end
            for (int j = offsetEndInSentText; j < sentText.length(); j++) {
              char c = sentText.charAt(j);
              if (!Character.isWhitespace(c)) {
                entityAtSentEnd = false;
                break;
              }
            }
            boolean doMerge = false;
            if (entityAtSentEnd) {
              CoreMap nextSentence = sentences.get(i+1);
              Character c = getFirstNonWsChar(nextSentence);
              if (c != null) {
                doMerge = !Character.isUpperCase(c);
                if (!doMerge) {
                  logger.debug("No merge: c is '" + c + "'");
                }
              } else {
                logger.debug("No merge: no char");
              }
            } else {
              logger.debug("No merge: entity not at end");
            }
            if (doMerge) {
              logger.debug("Merge chunks");
              // Merge the current sentence with the next one
              mergeChunks(sentences, text, i, i+2);
            }
          }
        }
        if (offsetsAreNotSorted) {
          // Next chunk may lie anywhere: restart the search from the first sentence
          i = 0;
        }
        // Re-read the sentence at i: it may have been replaced by a merge or i may have been reset
        sentence = sentences.get(i);
      }
    }
    // Do a bit more sentence fixing
    if (moreExtendedFixSentence) {
      int i = 0;
      // Walk over neighbouring sentence pairs; after a merge the same index is examined again
      while (i+1 < sentences.size()) {
        boolean doMerge = false;
        CoreMap sentence = sentences.get(i);
        CoreMap nextSentence = sentences.get(i+1);
        String sentTrimmedText = getTrimmedText(sentence);
        String nextSentTrimmedText = getTrimmedText(nextSentence);
        if (sentTrimmedText.length() <= 1 || nextSentTrimmedText.length() <= 1) {
          // Merge
          doMerge = true;
        } else {
 //         List sentTokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
 //         CoreLabel lastSentToken = sentTokens.get(sentTokens.size()-1);
          Character c = getFirstNonWsChar(nextSentence);
 //         List nextSentTokens = nextSentence.get(CoreAnnotations.TokensAnnotation.class);
          // Merge when the next sentence starts with a comma or a lowercase character
          if (c != null && !Character.isUpperCase(c)) {
            if (c == ',' || (Character.isLowerCase(c))) {
              doMerge = true;
            }
          }
        }
        if (doMerge) {
          mergeChunks(sentences, text, i, i+2);
        } else {
          i++;
        }
      }
    }
    // Set sentence indices
    for (int i = 0; i < sentences.size(); i++) {
      CoreMap sentence = sentences.get(i);
      sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, i);
    }
    return true;
  }

  /**
   * Annotates a CoreMap representing a chunk with basic chunk information.
   *   CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk
   *   CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk
   *   TokensAnnotation - List of tokens in this chunk (a copy of the selected sub-list)
   *   TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens)
   *                          tokenStartIndex + totalTokenOffset
   *   TokenEndAnnotation - Index of last token in chunk (index in original list of tokens)
   *                          tokenEndIndex + totalTokenOffset
   * If the selected token range is empty, the character offset annotations are left untouched
   * because there is no token to take them from.
   *
   * @param chunk - CoreMap to be annotated
   * @param tokens - List of tokens to look for chunks
   * @param tokenStartIndex - Index (relative to current list of tokens) at which this chunk starts
   * @param tokenEndIndex - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
   * @param totalTokenOffset - Index of tokens to offset by
   */
  public static void annotateChunk(CoreMap chunk,
                                   List<CoreLabel> tokens, int tokenStartIndex, int tokenEndIndex,  int totalTokenOffset) {
    List<CoreLabel> chunkTokens = new ArrayList<>(tokens.subList(tokenStartIndex, tokenEndIndex));
    if (!chunkTokens.isEmpty()) {
      CoreLabel firstToken = chunkTokens.get(0);
      CoreLabel lastToken = chunkTokens.get(chunkTokens.size() - 1);
      chunk.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class,
              firstToken.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class));
      chunk.set(CoreAnnotations.CharacterOffsetEndAnnotation.class,
              lastToken.get(CoreAnnotations.CharacterOffsetEndAnnotation.class));
    }
    chunk.set(CoreAnnotations.TokensAnnotation.class, chunkTokens);
    chunk.set(CoreAnnotations.TokenBeginAnnotation.class, tokenStartIndex + totalTokenOffset);
    chunk.set(CoreAnnotations.TokenEndAnnotation.class, tokenEndIndex + totalTokenOffset);
  }

  /**
   * Joins the text of the given tokens using a single space as delimiter.
   *
   * @param tokens - tokens to render
   * @param tokenTextKey - Key to use to find the token text
   * @return the joined token text
   */
  public static String getTokenText(List tokens, Class tokenTextKey) {
    final String singleSpace = " ";
    return getTokenText(tokens, tokenTextKey, singleSpace);
  }

  /**
   * Joins the text of the given tokens.
   *
   * <p>The delimiter is inserted between two pieces of text unless both tokens carry character
   * offsets and the current token begins exactly where the previous token with offsets ended
   * (i.e. there was no space between them in the source). Tokens without a value for
   * {@code tokenTextKey} contribute no text.
   *
   * @param tokens - tokens to render
   * @param tokenTextKey - Key to use to find the token text
   * @param delimiter - text to put between tokens that are not directly adjacent
   * @return the joined token text
   */
  public static String getTokenText(List<? extends CoreMap> tokens, Class tokenTextKey, String delimiter) {
    StringBuilder sb = new StringBuilder();
    int prevEndIndex = -1;
    for (CoreMap cm : tokens) {
      Object obj = cm.get(tokenTextKey);
      boolean includeDelimiter = sb.length() > 0;
      if (cm.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class) &&
          cm.containsKey(CoreAnnotations.CharacterOffsetEndAnnotation.class)) {
        int beginIndex = cm.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        int endIndex = cm.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        if (prevEndIndex == beginIndex) {
          // No spaces
          includeDelimiter = false;
        }
        // Tracked even when this token has no text, so adjacency is judged on offsets alone
        prevEndIndex = endIndex;
      }
      if (obj != null) {
        if (includeDelimiter) {
          sb.append(delimiter);
        }
        sb.append(obj);
      }
    }
    return sb.toString();
  }

  /**
   * Sets the chunk's TextAnnotation from its tokens.
   *   TextAnnotation - String built from the tokens of this chunk by {@code getTokenText}
   *                    (space used as delimiter)
   * @param chunk - CoreMap to be annotated (its TokensAnnotation is read)
   * @param tokenTextKey - Key to use to find the token text
   */
  public static void annotateChunkText(CoreMap chunk, Class tokenTextKey) {
    chunk.set(CoreAnnotations.TextAnnotation.class,
            getTokenText(chunk.get(CoreAnnotations.TokensAnnotation.class), tokenTextKey));
  }

  /**
   * Tells whether the chunk carries both character offset annotations.
   *
   * @param chunk - CoreMap to inspect
   * @return true if both the begin and the end character offset are non-null
   */
  public static boolean hasCharacterOffsets(CoreMap chunk) {
    if (chunk.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) == null) {
      return false;
    }
    return chunk.get(CoreAnnotations.CharacterOffsetEndAnnotation.class) != null;
  }

  /**
   * Sets the chunk's TextAnnotation by cutting it out of the text of another annotation.
   *   TextAnnotation - String extracted from the origAnnotation using character offset information for this chunk
   *
   * The chunk offsets are made relative to the begin offset of origAnnotation (0 if it has none)
   * and clamped to the bounds of its text; every adjustment is reported on the debug channel.
   *
   * @param chunk - CoreMap to be annotated
   * @param origAnnotation - Annotation from which to extract the text for this chunk
   * @return false if origAnnotation has no text or the chunk lacks character offsets, true otherwise
   */
  public static boolean annotateChunkText(CoreMap chunk, CoreMap origAnnotation) {
    String annoText = origAnnotation.get(CoreAnnotations.TextAnnotation.class);
    if (annoText == null || !hasCharacterOffsets(chunk)) {
      return false;
    }
    Integer annoBeginCharOffset = origAnnotation.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    if (annoBeginCharOffset == null) {
      annoBeginCharOffset = 0;
    }
    final int textLength = annoText.length();
    final Integer absBegin = chunk.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    final Integer absEnd = chunk.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
    int relBegin = absBegin - annoBeginCharOffset;
    int relEnd = absEnd - annoBeginCharOffset;

    // Clamp the begin offset into [0, textLength]
    if (relBegin < 0) {
      logger.debug("Adjusting begin char offset from " + relBegin + " to 0");
      logger.debug("Chunk begin offset: " + absBegin +
        ", Source text begin offset " + annoBeginCharOffset);
      relBegin = 0;
    } else if (relBegin > textLength) {
      logger.debug("Adjusting begin char offset from " + relBegin + " to " + textLength);
      logger.debug("Chunk begin offset: " + absBegin +
        ", Source text begin offset " + annoBeginCharOffset);
      relBegin = textLength;
    }

    // Clamp the end offset into [0, textLength]
    if (relEnd < 0) {
      logger.debug("Adjusting end char offset from " + relEnd + " to 0");
      logger.debug("Chunk end offset: " + absEnd +
        ", Source text begin offset " + annoBeginCharOffset);
      relEnd = 0;
    } else if (relEnd > textLength) {
      logger.debug("Adjusting end char offset from " + relEnd + " to " + textLength);
      logger.debug("Chunk end offset: " + absEnd +
        ", Source text begin offset " + annoBeginCharOffset);
      relEnd = textLength;
    }

    // Never let the end precede the begin
    if (relEnd < relBegin) {
      logger.debug("Adjusting end char offset from " + relEnd + " to " + relBegin);
      logger.debug("Chunk end offset: " + absEnd +
        ", Source text begin offset " + annoBeginCharOffset);
      relEnd = relBegin;
    }
    chunk.set(CoreAnnotations.TextAnnotation.class, annoText.substring(relBegin, relEnd));
    return true;
  }

  /**
   * Annotates tokens in chunk.
   * Does nothing when both keys are null.
   *
   * @param chunk - CoreMap representing chunk (should have TextAnnotation and TokensAnnotation)
   * @param tokenChunkKey - If not null, each token is annotated with the chunk using this key
   * @param tokenLabelKey - If not null, each token is annotated with the text associated with the chunk using this key
   */
  public static void annotateChunkTokens(CoreMap chunk, Class tokenChunkKey, Class tokenLabelKey) {
    if (tokenChunkKey == null && tokenLabelKey == null) {
      return;
    }
    List<CoreLabel> chunkTokens = chunk.get(CoreAnnotations.TokensAnnotation.class);
    String text = (tokenLabelKey != null) ? chunk.get(CoreAnnotations.TextAnnotation.class) : null;
    for (CoreLabel t : chunkTokens) {
      if (tokenLabelKey != null) {
        t.set(tokenLabelKey, text);
      }
      // Set after the label so that, as before, the chunk wins if both keys are the same
      if (tokenChunkKey != null) {
        t.set(tokenChunkKey, chunk);
      }
    }
  }

  /**
   * Builds a fresh Annotation (with empty text) and fills it in through {@code annotateChunk}:
   *   CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk
   *   CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk
   *   TokensAnnotation - List of tokens in this chunk
   *   TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens)
   *                          tokenStartIndex + totalTokenOffset
   *   TokenEndAnnotation - Index of last token in chunk (index in original list of tokens)
   *                          tokenEndIndex + totalTokenOffset
   * @param tokens - List of tokens to look for chunks
   * @param tokenStartIndex - Index (relative to current list of tokens) at which this chunk starts
   * @param tokenEndIndex - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
   * @param totalTokenOffset - Index of tokens to offset by
   * @return Annotation representing new chunk
   */
  public static Annotation getAnnotatedChunk(List tokens, int tokenStartIndex, int tokenEndIndex, int totalTokenOffset) {
    final Annotation annotated = new Annotation("");
    annotateChunk(annotated, tokens, tokenStartIndex, tokenEndIndex, totalTokenOffset);
    return annotated;
  }

  /**
   * Builds a fresh Annotation for a chunk, then adds its text and annotates its tokens:
   *   CharacterOffsetBeginAnnotation - set to CharacterOffsetBeginAnnotation of first token in chunk
   *   CharacterOffsetEndAnnotation - set to CharacterOffsetEndAnnotation of last token in chunk
   *   TokensAnnotation - List of tokens in this chunk
   *   TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens)
   *                          tokenStartIndex + totalTokenOffset
   *   TokenEndAnnotation - Index of last token in chunk (index in original list of tokens)
   *                          tokenEndIndex + totalTokenOffset
   *   TextAnnotation - String built from the chunk's tokens using tokenTextKey
   * @param tokens - List of tokens to look for chunks
   * @param tokenStartIndex - Index (relative to current list of tokens) at which this chunk starts
   * @param tokenEndIndex - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
   * @param totalTokenOffset - Index of tokens to offset by
   * @param tokenChunkKey - If not null, each token is annotated with the chunk using this key
   * @param tokenTextKey - Key to use to find the token text
   * @param tokenLabelKey - If not null, each token is annotated with the text associated with the chunk using this key
   * @return Annotation representing new chunk
   */
  public static Annotation getAnnotatedChunk(List tokens, int tokenStartIndex, int tokenEndIndex, int totalTokenOffset,
                                             Class tokenChunkKey, Class tokenTextKey,  Class tokenLabelKey) {
    // Same steps as the four-argument overload, written out here
    final Annotation annotated = new Annotation("");
    annotateChunk(annotated, tokens, tokenStartIndex, tokenEndIndex, totalTokenOffset);
    // Text first: annotateChunkTokens reads the chunk text when labelling the tokens
    annotateChunkText(annotated, tokenTextKey);
    annotateChunkTokens(annotated, tokenChunkKey, tokenLabelKey);
    return annotated;
  }

  /**
   * Builds a chunk Annotation for a token span of an existing annotation.
   * <p>
   * The tokens are read from the annotation's TokensAnnotation and the token offset from its
   * TokenBeginAnnotation (0 when that is absent). The chunk text is first taken from the
   * original annotation via {@code annotateChunkText(chunk, annotation)}; when that call reports
   * failure, the text is derived from the tokens using TextAnnotation as the token text key.
   *   CharacterOffsetBeginAnnotation - CharacterOffsetBeginAnnotation of the first token in the chunk
   *   CharacterOffsetEndAnnotation - CharacterOffsetEndAnnotation of the last token in the chunk
   *   TokensAnnotation - List of tokens in this chunk
   *   TokenBeginAnnotation - Index of the first token in the chunk (index in original list of tokens):
   *                          tokenStartIndex + annotation's TokenBeginAnnotation
   *   TokenEndAnnotation - Index of the last token in the chunk (index in original list of tokens):
   *                          tokenEndIndex + annotation's TokenBeginAnnotation
   *   TextAnnotation - String extracted from the original annotation using the chunk's character
   *                    offsets, or built from the tokens as a fallback
   *
   * @param annotation - Annotation from which to extract the text for this chunk
   * @param tokenStartIndex - Index (relative to current list of tokens) at which this chunk starts
   * @param tokenEndIndex - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
   * @return Annotation representing new chunk
   */
  public static Annotation getAnnotatedChunk(CoreMap annotation, int tokenStartIndex, int tokenEndIndex) {
    List tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    Integer tokenBeginOrNull = annotation.get(CoreAnnotations.TokenBeginAnnotation.class);
    int tokenOffset = (tokenBeginOrNull != null) ? tokenBeginOrNull : 0;

    Annotation result = getAnnotatedChunk(tokens, tokenStartIndex, tokenEndIndex, tokenOffset);
    if (!annotateChunkText(result, annotation)) {
      // Character offsets could not be used: fall back to the token text.
      annotateChunkText(result, CoreAnnotations.TextAnnotation.class);
    }
    return result;
  }

  /**
   * Builds a chunk Annotation for a token span of an existing annotation and annotates the
   * chunk's tokens.
   * <p>
   * The chunk itself comes from the three-argument {@code getAnnotatedChunk(CoreMap, int, int)}
   * overload; it is then passed to {@code annotateChunkTokens} with the two supplied keys.
   *   CharacterOffsetBeginAnnotation - CharacterOffsetBeginAnnotation of the first token in the chunk
   *   CharacterOffsetEndAnnotation - CharacterOffsetEndAnnotation of the last token in the chunk
   *   TokensAnnotation - List of tokens in this chunk
   *   TokenBeginAnnotation - Index of the first token in the chunk (index in original list of tokens):
   *                          tokenStartIndex + annotation's TokenBeginAnnotation
   *   TokenEndAnnotation - Index of the last token in the chunk (index in original list of tokens):
   *                          tokenEndIndex + annotation's TokenBeginAnnotation
   *   TextAnnotation - String extracted from the original annotation using character offset
   *                    information for this chunk
   *
   * @param annotation - Annotation from which to extract the text for this chunk
   * @param tokenStartIndex - Index (relative to current list of tokens) at which this chunk starts
   * @param tokenEndIndex - Index (relative to current list of tokens) at which this chunk ends (not inclusive)
   * @param tokenChunkKey - If not null, each token is annotated with the chunk using this key
   * @param tokenLabelKey - If not null, each token is annotated with the text associated with the chunk using this key
   * @return Annotation representing new chunk
   */
  public static Annotation getAnnotatedChunk(CoreMap annotation, int tokenStartIndex, int tokenEndIndex,
                                             Class tokenChunkKey, Class tokenLabelKey) {
    final Annotation result = getAnnotatedChunk(annotation, tokenStartIndex, tokenEndIndex);
    annotateChunkTokens(result, tokenChunkKey, tokenLabelKey);
    return result;
  }

  /**
   * Returns a chunk annotation based on char offsets.
   * <p>
   * Delegates to the two-argument {@code getAnnotatedChunksUsingSortedCharOffsets} with a
   * single-element offset list and returns the first chunk it produces.
   *
   * @param annotation Annotation from which to extract the text for this chunk
   * @param charOffsetStart Start character offset
   * @param charOffsetEnd End (not inclusive) character offset
   * @return An Annotation representing the new chunk. Or {@code null} if no chunk matches offsets.
   */
  public static CoreMap getAnnotatedChunkUsingCharOffsets(CoreMap annotation, int charOffsetStart, int charOffsetEnd) {
    // TODO: make more efficient search
    // The element type is spelled out so that get(0) is a CoreMap rather than an Object.
    List<CoreMap> chunks = getAnnotatedChunksUsingSortedCharOffsets(annotation,
            CollectionUtils.makeList(new IntPair(charOffsetStart, charOffsetEnd)));
    return chunks.isEmpty() ? null : chunks.get(0);
  }

  /**
   * Convenience overload of the six-argument {@code getAnnotatedChunksUsingSortedCharOffsets}:
   * the character offsets are treated as relative to {@code annotation}, no token chunk key or
   * token label key is supplied (both {@code null}), and partial tokens are allowed.
   *
   * @param annotation Annotation from which to extract the chunks
   * @param charOffsets List of start and end (not inclusive) character offsets
   * @return whatever the six-argument overload returns for these settings
   */
  public static List getAnnotatedChunksUsingSortedCharOffsets(
          CoreMap annotation, List charOffsets) {
    final boolean offsetsAreRelative = true;
    final boolean partialTokensAllowed = true;
    return getAnnotatedChunksUsingSortedCharOffsets(
            annotation, charOffsets, offsetsAreRelative, null, null, partialTokensAllowed);
  }

  /**
   * Create a list of new chunk Annotations, one per character offset pair.
   * Each chunk carries:
   *   TextAnnotation - Substring of the annotation's text for the (clamped) character span
   *   CharacterOffsetBeginAnnotation - Absolute begin character offset of the span
   *   CharacterOffsetEndAnnotation - Absolute end character offset of the span, clamped to the end
   *                                  of the annotation's text so it stays consistent with the text
   *   TokensAnnotation - Copy of the sub-list of tokens selected for the span
   *   TokenBeginAnnotation - Index of first token in chunk (index in original list of tokens)
   *                          tokenStartIndex + annotation's TokenBeginAnnotation
   *   TokenEndAnnotation - Index after the last token in chunk (index in original list of tokens)
   *                          tokenEndIndex + annotation's TokenBeginAnnotation
   * Each chunk is also passed to {@code annotateChunkTokens} with the two supplied keys.
   * <p>
   * Processing stops early when a span begins at or after the end of the annotation's text, or
   * when the tokens are exhausted. A warning is logged when fewer chunks than offset pairs
   * were produced.
   *
   * @param annotation Annotation from which to extract the text for this chunk
   * @param charOffsets - List of start and end (not inclusive) character offsets
   *                      Note: assume char offsets are sorted and non-overlapping!!!
   * @param charOffsetIsRelative - Whether the character offsets are relative to the current annotation or absolute offsets
   * @param tokenChunkKey - If not null, each token is annotated with the chunk using this key
   * @param tokenLabelKey - If not null, each token is annotated with the text associated with the chunk using this key
   * @param allowPartialTokens - Whether to allow partial tokens or not
   * @return List of Annotation representing new chunks; may be empty never null
   */
  public static List<CoreMap> getAnnotatedChunksUsingSortedCharOffsets(
          CoreMap annotation, List<IntPair> charOffsets, boolean charOffsetIsRelative,
          Class tokenChunkKey, Class tokenLabelKey, boolean allowPartialTokens) {
    String annoText = annotation.get(CoreAnnotations.TextAnnotation.class);
    List<CoreMap> chunks = new ArrayList<>(charOffsets.size());
    List<CoreLabel> annoTokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
    Integer annoCharBegin = annotation.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    if (annoCharBegin == null) { annoCharBegin = 0; }
    Integer annoTokenBegin = annotation.get(CoreAnnotations.TokenBeginAnnotation.class);
    if (annoTokenBegin == null) { annoTokenBegin = 0; }
    // Token cursor shared across spans: the offsets are assumed sorted, so it only moves forward.
    int i = 0;
    for (IntPair p : charOffsets) {
      int beginRelCharOffset = charOffsetIsRelative ? p.getSource() : p.getSource() - annoCharBegin;
      int endRelCharOffset = charOffsetIsRelative ? p.getTarget() : p.getTarget() - annoCharBegin;
      if (beginRelCharOffset >= annoText.length()) { break; }
      if (endRelCharOffset > annoText.length()) { endRelCharOffset = annoText.length(); }
      // Absolute offsets are derived AFTER clamping so that the chunk's end offset
      // always agrees with the chunk text extracted below.
      int beginCharOffset = beginRelCharOffset + annoCharBegin;
      int endCharOffset = endRelCharOffset + annoCharBegin;
      // Skip tokens that lie before the span.
      if (allowPartialTokens) {
        while (i < annoTokens.size() && annoTokens.get(i).endPosition() <= beginCharOffset) {
          i++;
        }
      } else {
        while (i < annoTokens.size() && annoTokens.get(i).beginPosition() < beginCharOffset) {
          i++;
        }
      }
      if (i >= annoTokens.size()) break;
      int tokenBegin = i;
      // Collect tokens that belong to the span.
      int j = i;
      if (allowPartialTokens) {
        while (j < annoTokens.size() && annoTokens.get(j).beginPosition() < endCharOffset) {
          j++;
        }
      } else {
        while (j < annoTokens.size() && annoTokens.get(j).endPosition() <= endCharOffset) {
          assert(annoTokens.get(j).beginPosition() >= beginCharOffset);
          j++;
        }
      }
      int tokenEnd = j;

      List<CoreLabel> chunkTokens = new ArrayList<>(annoTokens.subList(tokenBegin, tokenEnd));
      String chunkText = annoText.substring(beginRelCharOffset, endRelCharOffset);
      Annotation chunk = new Annotation(chunkText);
      chunk.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, beginCharOffset);
      chunk.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, endCharOffset);
      chunk.set(CoreAnnotations.TokensAnnotation.class, chunkTokens);
      chunk.set(CoreAnnotations.TokenBeginAnnotation.class, tokenBegin + annoTokenBegin);
      chunk.set(CoreAnnotations.TokenEndAnnotation.class, tokenEnd + annoTokenBegin);
      annotateChunkTokens(chunk, tokenChunkKey, tokenLabelKey);
      chunks.add(chunk);
      // No tokens left for any later span.
      if (j >= annoTokens.size()) break;
    }
    if (chunks.size() != charOffsets.size()) {
      logger.warning("WARNING: Only " + chunks.size() + "/" + charOffsets.size()
              + " chunks found.  Check if offsets are sorted/nonoverlapping");
    }
    return chunks;
  }

  /**
   * Aggregates a token-level attribute and stores the result on the annotation itself.
   * The aggregator is applied to {@code aggrKey} over the annotation's TokensAnnotation, and
   * the value it returns is set on the annotation under {@code newAnnotationKey}.
   *
   * @param annotation - Annotation whose tokens are aggregated and which receives the result
   * @param newAnnotationKey - Key under which the aggregated value is stored
   * @param aggrKey - Key handed to the aggregator
   * @param aggregator - Aggregator used to combine the tokens' values
   */
  public static void annotateChunk(CoreMap annotation, Class newAnnotationKey,
                                   Class aggrKey, CoreMapAttributeAggregator aggregator) {
    annotation.set(newAnnotationKey,
            aggregator.aggregate(aggrKey, annotation.get(CoreAnnotations.TokensAnnotation.class)));
  }

  /**
   * Sets annotations on a chunk from a map of attribute names to string values.
   * <p>
   * Each attribute name is resolved to an annotation key class with
   * {@code AnnotationLookup.toCoreKey}. A {@code null} value is stored as {@code null}. A
   * non-null value is stored as is when the key's value type is {@code String}; otherwise it is
   * converted by calling the value type's {@code valueOf(String)} method reflectively.
   *
   * @param chunk - Chunk to annotate
   * @param attributes - Attribute names mapped to the string form of their values
   * @throws UnsupportedOperationException if an attribute name is {@code null} or does not
   *         resolve to an annotation key class
   * @throws RuntimeException if the value type cannot be determined or the value cannot be
   *         converted and set
   */
  public static void annotateChunk(CoreMap chunk, Map<String, String> attributes) {
    for (Map.Entry<String, String> entry : attributes.entrySet()) {
      String key = entry.getKey();
      String value = entry.getValue();
      // Reject a null name before it is handed to the lookup.
      if (key == null) {
        throw new UnsupportedOperationException("Unknown null attribute.");
      }
      Class coreKeyClass = AnnotationLookup.toCoreKey(key);
      // Guard against a name the lookup could not resolve, instead of failing later
      // with a less informative error (or setting a null key on the chunk).
      if (coreKeyClass == null) {
        throw new UnsupportedOperationException("Unknown attribute: " + key);
      }
      if (value == null) {
        chunk.set(coreKeyClass, null);
        continue;
      }
      try {
        Class valueClass = AnnotationLookup.getValueType(coreKeyClass);
        if (valueClass == String.class) {
          chunk.set(coreKeyClass, value);
        } else {
          // getMethod never returns null; it throws NoSuchMethodException when absent.
          Method valueOfMethod = valueClass.getMethod("valueOf", String.class);
          // valueOf is expected to be static, so no receiver object is supplied.
          chunk.set(coreKeyClass, valueOfMethod.invoke(null, value));
        }
      } catch (Exception ex) {
        throw new RuntimeException("Unable to annotate attribute " + key, ex);
      }
    }
  }

  /**
   * Applies {@code annotateChunk(chunk, attributes)} to every chunk whose index lies in
   * {@code [start, end)}. Nothing is done when {@code start >= end}.
   *
   * @param chunks - List of chunks
   * @param start - Index of the first chunk to annotate
   * @param end - Index after the last chunk to annotate (not inclusive)
   * @param attributes - Attribute names mapped to the string form of their values
   */
  public static void annotateChunks(List<? extends CoreMap> chunks, int start, int end, Map<String, String> attributes) {
    for (int i = start; i < end; i++) {
      annotateChunk(chunks.get(i), attributes);
    }
  }

  /**
   * Applies {@code annotateChunk(chunk, attributes)} to every chunk in the list.
   *
   * @param chunks - List of chunks to annotate
   * @param attributes - Attribute names mapped to the string form of their values
   */
  public static void annotateChunks(List<? extends CoreMap> chunks, Map<String, String> attributes) {
    for (CoreMap chunk : chunks) {
      annotateChunk(chunk, attributes);
    }
  }

  /**
   * Creates a new token (via the factory) covering {@code text.substring(start, end)}.
   * <p>
   * The token's TextAnnotation is the substring; when the token is a {@code CoreLabel} its
   * ValueAnnotation is set to the same string. The character offsets are {@code start} and
   * {@code end} shifted by {@code cm}'s CharacterOffsetBeginAnnotation (0 when absent).
   *
   * @param cm - CoreMap whose CharacterOffsetBeginAnnotation is used as the base offset
   * @param text - Text from which the token text is taken
   * @param start - Start index into {@code text}
   * @param end - End index into {@code text} (not inclusive)
   * @param factory - Factory used to create the new token
   * @return the new token, or {@code null} when {@code end <= start}
   */
  public static <T extends CoreMap> T createCoreMap(CoreMap cm, String text, int start, int end,
                                                    CoreTokenFactory<T> factory) {
    if (end <= start) {
      // Empty (or inverted) span: nothing to create.
      return null;
    }
    T token = factory.makeToken();
    Integer cmCharStart = cm.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
    int baseOffset = (cmCharStart != null) ? cmCharStart : 0;
    String tokenText = text.substring(start, end);
    token.set(CoreAnnotations.TextAnnotation.class, tokenText);
    if (token instanceof CoreLabel) {
      token.set(CoreAnnotations.ValueAnnotation.class, tokenText);
    }
    token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, baseOffset + start);
    token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, baseOffset + end);
    return token;
  }

  /**
   * Creates a token for {@code text.substring(start, end)} with {@code createCoreMap} and
   * appends it to {@code res}. Nothing is appended when {@code createCoreMap} returns
   * {@code null}.
   *
   * @param res - List to append the new token to
   * @param cm - CoreMap passed through to {@code createCoreMap}
   * @param text - Text from which the token text is taken
   * @param start - Start index into {@code text}
   * @param end - End index into {@code text} (not inclusive)
   * @param factory - Factory used to create the new token
   */
  public static <T extends CoreMap> void appendCoreMap(List<T> res,
                                                       CoreMap cm, String text, int start, int end,
                                                       CoreTokenFactory<T> factory) {
    T created = createCoreMap(cm, text, start, end, factory);
    if (created != null) {
      res.add(created);
    }
  }

  /**
   * Splits the TextAnnotation of {@code cm} around matches of a pattern and returns one token
   * per non-empty piece, in text order.
   * <p>
   * The text between consecutive matches (and before the first / after the last match) always
   * yields a token; the matched text itself yields a token only when {@code includeMatched}
   * is true. Empty pieces produce no token.
   *
   * @param p - Pattern to split on
   * @param includeMatched - Whether the matched text is also returned as tokens
   * @param cm - CoreMap whose TextAnnotation is split
   * @param factory - Factory used to create the tokens
   * @return List of new tokens; may be empty
   */
  public static <T extends CoreMap> List<T> splitCoreMap(Pattern p, boolean includeMatched,
                                                         CoreMap cm, CoreTokenFactory<T> factory) {
    List<T> res = new ArrayList<>();
    String text = cm.get(CoreAnnotations.TextAnnotation.class);
    Matcher m = p.matcher(text);
    // Index of the first character not yet emitted.
    int index = 0;
    while (m.find()) {
      int start = m.start();
      int end = m.end();
      // Include characters from index to m.start()
      appendCoreMap(res, cm, text, index, start, factory);
      // Include matched pattern
      if (includeMatched) {
        appendCoreMap(res, cm, text, start, end, factory);
      }
      index = end;
    }
    // Remainder after the last match (the whole text when nothing matched).
    appendCoreMap(res, cm, text, index, text.length(), factory);
    return res;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy