com.bbn.bue.common.strings.LocatedString Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of common-core-open Show documentation
There is a newer version: 4.1.2
package com.bbn.bue.common.strings;

import com.bbn.bue.common.strings.offsets.ASRTime;
import com.bbn.bue.common.strings.offsets.ByteOffset;
import com.bbn.bue.common.strings.offsets.CharOffset;
import com.bbn.bue.common.strings.offsets.EDTOffset;
import com.bbn.bue.common.strings.offsets.OffsetGroup;
import com.bbn.bue.common.strings.offsets.OffsetGroupRange;
import com.bbn.bue.common.strings.offsets.OffsetRange;

import com.google.common.base.Objects;
import com.google.common.base.Optional;
import com.google.common.collect.ImmutableList;

import java.util.List;
import java.util.NoSuchElementException;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Class for storing and manipulating strings that have been read in from a file, without losing
 * the relationship between each character and its origin in the file from which it was read.  In
 * particular, for each character in the located string, we record a start offset and an end offset
 * of each offset type (ByteOffset, CharOffset, EDTOffset, and ASRTime).  Start offsets and end
 * offsets are zero-indexed, and both are inclusive.  E.g., if a character in the string came from
 * a single byte at position 12, then that character's start ByteOffset and end ByteOffset will both
 * be 12.  For a character that was encoded using three bytes at positions 14, 15, and 16, the start
 * ByteOffset will be 14, and the end ByteOffset will be 16.
 *
 * In unmodified LocatedStrings, the start CharOffset for each character will be equal to its end
 * CharOffset.  However, modifications that replace substrings can result in individual characters
 * whose start and end offsets are not equal, since the offsets of the replacement characters are
 * set based on the entire range of characters in the replaced substring.
 *
 * The four offset types that are currently stored for each character are:
 *
 * - CharOffset.  More accurately, this is a unicode code point offset.
 *
 * - ByteOffset.  Currently, we assume that the source string was UTF-8, and calculate byte offsets
 * by checking how many bytes it would take to encode each character.  In the future, if we did our
 * own unicode encoding, we could directly read byte offsets for other encodings.
 *
 * - EDTOffset.  EDT offsets are similar to character offsets, except that (i) any substrings
 * starting with "<" and extending to the matching ">" are skipped when counting offsets; and (ii)
 * the character "\r" is skipped when counting offsets. Note that condition (i) is *not* always
 * identical to skipping XML/SGML tags and comments.
 *
 * - ASRTime.  The start and end time of the speech signal that corresponds to a character.  ASRTime
 * must be set after a LocatedString is constructed, using setAsrStartTime() and setAsrEndTime().
 *
 * @author originally by David A. Herman, refactored by Edward Loper; translated to Java by Ryan
 *         Gabbard
 * @author rgabbard
 */
public final class LocatedString {

  /**
   * Returns the character content of this string as a plain {@code String}, without any of the
   * offset information.
   */
  public String text() {
    return this.content;
  }

  /**
   * Returns the length of the content as reported by {@link String#length()}.
   */
  public int length() {
    return text().length();
  }

  // ******************************************************************
  // Offset accessors
  // ******************************************************************

  /**
   * Returns the EDT offset at the inclusive start of {@link #bounds()}.
   */
  public EDTOffset startEDTOffset() {
    return bounds().startInclusive().edtOffset();
  }

  /**
   * Returns the EDT offset at the inclusive end of {@link #bounds()}.
   */
  public EDTOffset endEDTOffset() {
    return bounds().endInclusive().edtOffset();
  }

  /**
   * Returns the character offset at the inclusive start of {@link #bounds()}.
   */
  public CharOffset startCharOffset() {
    return bounds().startInclusive().charOffset();
  }

  /**
   * Returns the character offset at the inclusive end of {@link #bounds()}.
   */
  public CharOffset endCharOffset() {
    return bounds().endInclusive().charOffset();
  }

  /**
   * Returns the offset range this string was constructed with; its two ends are exposed through
   * {@code startInclusive()} and {@code endInclusive()}.
   */
  public OffsetGroupRange bounds() {
    return this.bounds;
  }

  /**
   * Returns the offset entries of this string, in content order.  Each {@link OffsetEntry}
   * records the source offsets for one run of content positions.
   *
   * <p>The element type is declared explicitly so that callers can iterate over the entries as
   * {@code OffsetEntry} without casting.
   *
   * @return the list of entries held by this string (not a copy)
   */
  public List<OffsetEntry> offsetEntries() {
    return offsets;
  }

  /**
   * Creates a {@code LocatedString} for {@code text} whose byte, character and EDT offsets all
   * start counting from zero.
   */
  public static LocatedString forString(final String text) {
    final ByteOffset firstByte = new ByteOffset(0);
    final CharOffset firstChar = new CharOffset(0);
    final EDTOffset firstEdt = EDTOffset.asEDTOffset(0);
    return forString(text, OffsetGroup.from(firstByte, firstChar, firstEdt));
  }

  /**
   * Creates a {@code LocatedString} from pre-computed parts.  Nothing is recalculated: the
   * supplied bounds and entries are used as given.
   *
   * @param text the string content
   * @param bounds the offsets of the first and last characters of {@code text}
   * @param spanOffsets the offset entries describing {@code text}
   * @return a located string wrapping the supplied values
   */
  public static LocatedString forString(final String text, final OffsetGroupRange bounds,
      final List<OffsetEntry> spanOffsets) {
    return new LocatedString(text, spanOffsets, bounds);
  }

  /**
   * Creates a {@code LocatedString} for {@code text} whose offsets start counting from
   * {@code initialOffsets}.  EDT offsets are not treated as character offsets.
   */
  public static LocatedString forString(final String text, final OffsetGroup initialOffsets) {
    final boolean edtOffsetsAreCharOffsets = false;
    return forString(text, initialOffsets, edtOffsetsAreCharOffsets);
  }

  /**
   * Creates a {@code LocatedString} for {@code text} whose offsets start counting from
   * {@code initialOffsets}.
   *
   * <p>The offset entries are computed first and the bounds are then taken from the first and
   * last entry.  Text that produces no entries (the empty string) is rejected with an
   * {@link IllegalArgumentException} when the bounds are derived.
   *
   * @param EDTOffsetsAreCharOffsets if true, the EDT offset advances on every character and no
   *     special handling of {@code '<'}, {@code '>'} or {@code '\r'} takes place
   */
  public static LocatedString forString(final String text, final OffsetGroup initialOffsets,
      final boolean EDTOffsetsAreCharOffsets) {
    final List<OffsetEntry> entries =
        calculateOffsets(text, initialOffsets, EDTOffsetsAreCharOffsets);
    return new LocatedString(text, entries, boundsFromOffsets(entries));
  }

  /**
   * Returns the part of this string between two offset groups as a {@code LocatedString}.  Only
   * the character offsets of {@code start} and {@code end} are consulted.
   *
   * NOTE: the offsets of the substring have to be rebuilt, which makes this method
   * *significantly* more expensive than fetching just the String content.  When only the
   * content is needed, call rawSubstring() instead.
   */
  public LocatedString substring(final OffsetGroup start, final OffsetGroup end) {
    final CharOffset firstChar = start.charOffset();
    final CharOffset lastChar = end.charOffset();
    return substring(firstChar, lastChar);
  }

  /**
   * Returns the part of this string between two character offsets (both inclusive) as a
   * {@code LocatedString}.  The offsets are turned into content indices by subtracting the
   * character offset at which this string's bounds start.
   *
   * NOTE: the offsets of the substring have to be rebuilt, which makes this method
   * *significantly* more expensive than fetching just the String content.  When only the
   * content is needed, call rawSubstring() instead.
   */
  public LocatedString substring(final CharOffset start, final CharOffset end) {
    final int beginIndex = start.value() - bounds.startInclusive().charOffset().value();
    // "end" is inclusive, whereas the index-based overload takes an exclusive end
    final int endIndex = 1 + end.value() - bounds.startInclusive().charOffset().value();

    return substring(beginIndex, endIndex);
  }

	/*public LocatedString substringConvertP(OffsetGroup start, OffsetGroup end) {
                final int startOffset = start.charOffset().value() - bounds.start.charOffset().value();
		final int endOffset = end.charOffset().value() - bounds.start.charOffset().value() + 1;

		return substringConvertP(startOffset, endOffset);
	}

	public LocatedString substringConvertP(int startIndexInclusive, int endIndexExclusive) {
		final String text = content.substring(startIndexInclusive, endIndexExclusive);
		final String text2 = text.replace("", "");
		final List offsets = offsetsOfSubstring(startIndexInclusive, endIndexExclusive);
		final OffsetRange bounds = boundsFromOffsets(offsets);
		System.out.println(new LocatedString(text2, offsets, bounds));
		return new LocatedString(text2, offsets, bounds);
	}*/

  /**
   * Returns the part of this string between two content indices as a {@code LocatedString}.
   * The content is sliced first, then the matching offset entries are derived, and finally the
   * bounds are taken from those entries.
   *
   * NOTE: the offsets of the substring have to be rebuilt, which makes this method
   * *significantly* more expensive than fetching just the String content.  When only the
   * content is needed, call rawSubstring() instead.
   */
  public LocatedString substring(final int startIndexInclusive, final int endIndexExclusive) {
    final String piece = content.substring(startIndexInclusive, endIndexExclusive);
    final List<OffsetEntry> pieceOffsets =
        offsetsOfSubstring(startIndexInclusive, endIndexExclusive);
    return new LocatedString(piece, pieceOffsets, boundsFromOffsets(pieceOffsets));
  }

  /**
   * Returns the plain {@code String} content lying between two offset groups.  Only the
   * character offsets of the two groups are consulted.
   *
   * @param start group whose character offset marks the first character to include
   * @param end group whose character offset marks the last character to include
   * @return the content between the two character offsets, without offset information
   */
  public String rawSubstring(final OffsetGroup start, final OffsetGroup end) {
    final CharOffset firstChar = start.charOffset();
    final CharOffset lastChar = end.charOffset();
    return rawSubstring(firstChar, lastChar);
  }

  /**
   * Returns the plain {@code String} content lying between two character offsets.  The offsets
   * are turned into content indices by subtracting the character offset at which this string's
   * bounds start.
   *
   * @param start character offset of the first character to include
   * @param end character offset of the last character to include (inclusive)
   * @return the content between the two offsets, without offset information
   */
  public String rawSubstring(final CharOffset start, final CharOffset end) {
    final int beginIndex = start.value() - bounds.startInclusive().charOffset().value();
    // "end" is inclusive, whereas the index-based overload takes an exclusive end
    final int endIndex = 1 + end.value() - bounds.startInclusive().charOffset().value();

    return rawSubstring(beginIndex, endIndex);
  }

  /**
   * Returns the plain {@code String} content between two content indices, exactly as
   * {@link String#substring(int, int)} would.
   *
   * @param startIndexInclusive index of the first character to include
   * @param endIndexExclusive index one past the last character to include
   * @return the selected content, without offset information
   */
  public String rawSubstring(final int startIndexInclusive, final int endIndexExclusive) {
    return text().substring(startIndexInclusive, endIndexExclusive);
  }

  /**
   * Returns the earliest offset group within this {@code LocatedString} whose character offset
   * matches the one supplied. If no such offset group exists, throws a {@link
   * NoSuchElementException}.
   *
   * <p>The returned group is built from the supplied character offset and a derived EDT offset
   * only.
   *
   * @param offset the character offset to look up
   * @return an offset group for {@code offset}
   * @throws NoSuchElementException if no offset entry covers {@code offset}
   */
  public OffsetGroup offsetGroupForCharOffset(final CharOffset offset) {
    final int target = offset.value();
    // if this ever slows us down significantly, we can binary search
    for (final OffsetEntry entry : offsets) {
      final int entryStartChar = entry.startOffset.charOffset().value();
      final int entryEndChar = entry.endOffset.charOffset().value();
      // Entry end offsets are inclusive, so the last character of an entry (and the only
      // character of a one-character entry) has to match as well.
      if (entryStartChar <= target && target <= entryEndChar) {
        final int entryStartEdt = entry.startOffset.edtOffset().value();
        final int edtLength = entry.endOffset.edtOffset().value() - entryStartEdt;
        // We assume EDT offsets are continuous within entries.  The exception, handled the same
        // way as in offsetsOfSubstring, is an entry whose EDT span is empty (for example a tag
        // that is skipped for EDT purposes): there the EDT offset stays put.
        final int edtValue =
            (edtLength == 0) ? entryStartEdt : entryStartEdt + (target - entryStartChar);

        return OffsetGroup.from(offset, new EDTOffset(edtValue));
      }
    }
    throw new NoSuchElementException("No offset entry covers character offset " + target);
  }

  /**
   * Tests whether {@code other} is a substring of this string, judged by both its character
   * offsets and its content.
   */
  public boolean contains(LocatedString other) {
    // TODO: the check lives in isSubstringOf because that is how the C++ version is written,
    // which made for an easy, less error-prone translation.  contains() is the more idiomatic
    // Java entry point, so it simply delegates with the roles swapped.
    final LocatedString container = this;
    return other.isSubstringOf(container);
  }

  /**
   * Finds the content position that corresponds to a character offset.
   *
   * Entries are scanned in order.  The scan gives up as soon as an entry starts after the
   * requested offset; otherwise the first entry whose (inclusive) end offset reaches the
   * requested offset supplies the position.
   *
   * preserves the CPP interface, more or less
   *
   * @param charOffset the character offset to locate
   * @return the content position for {@code charOffset}, or -1 if no entry covers it
   */
  private int positionOfStartOffsetChar(final CharOffset charOffset) {
    for (final OffsetEntry entry : offsetEntries()) {
      final int wanted = charOffset.asInt();
      final int entryFirst = entry.startOffset().charOffset().asInt();
      if (entryFirst > wanted) {
        return -1;
      }
      final int entryLast = entry.endOffset().charOffset().asInt();
      if (wanted <= entryLast) {
        return entry.startPos() + (wanted - entryFirst);
      }
    }
    return -1;
  }

  /**
   * Returns the start character offset for the content position {@code pos}.  The entry is
   * chosen with {@code lastEntryStartingBefore}; {@code pos} must fall inside it, otherwise an
   * {@link IllegalArgumentException} is thrown.  Positions past the first one of the entry are
   * obtained by adding the distance from the entry's first position to its start offset.
   */
  private CharOffset getStartOffset(int pos) {
    final OffsetEntry entry = offsetEntries().get(lastEntryStartingBefore(pos));
    checkArgument(entry.startPos() <= pos && entry.endPos() - 1 >= pos);
    final int shift = pos - entry.startPos();
    return (shift == 0)
        ? entry.startOffset().charOffset()
        : CharOffset.asCharOffset(entry.startOffset().charOffset().asInt() + shift);
  }

  /**
   * Returns the end character offset for the content position {@code pos}.  The entry is chosen
   * with {@code lastEntryStartingBefore}; {@code pos} may lie anywhere from the entry's first
   * position up to and including its {@code endPos()}, otherwise an
   * {@link IllegalArgumentException} is thrown.
   *
   * The entry's own end offset is returned for its last position; every other position is
   * obtained by adding the distance from the entry's first position to its start offset.
   */
  private CharOffset getEndOffset(int pos) {
    final OffsetEntry entry = offsetEntries().get(lastEntryStartingBefore(pos));
    checkArgument(entry.startPos() <= pos && entry.endPos() >= pos);
    final boolean atLastPositionOfEntry = (entry.endPos() - 1 == pos);
    if (atLastPositionOfEntry) {
      return entry.endOffset().charOffset();
    }
    final int shift = pos - entry.startPos();
    return CharOffset.asCharOffset(entry.startOffset().charOffset().asInt() + shift);
  }

  /**
   * Tests whether this string is a substring of {@code sup}.  The checks run in this order:
   * the character offset of this string's first entry must map to a position in {@code sup};
   * this string must fit into {@code sup} from that position; the start and end character
   * offsets must agree with those {@code sup} reports for the matched span; and finally the
   * content of the span must equal this string's content.
   */
  private boolean isSubstringOf(LocatedString sup) {
    final int startInSup =
        sup.positionOfStartOffsetChar(offsetEntries().get(0).startOffset().charOffset());
    final int endInSup = startInSup + length();
    if (startInSup < 0 || endInSup > sup.length()) {
      return false;
    }

    final OffsetRange ownCharOffsets = this.bounds().asCharOffsetRange();
    final boolean startsAgree =
        ownCharOffsets.startInclusive().asInt() == sup.getStartOffset(startInSup).asInt();
    if (!startsAgree) {
      return false;
    }
    // getEndOffset is asked about the exclusive end position, hence the -1
    final boolean endsAgree =
        ownCharOffsets.endInclusive().asInt() == sup.getEndOffset(endInSup).asInt() - 1;
    if (!endsAgree) {
      return false;
    }
    //TODO: if this is slow, do a point by point comparison instead of substring
    return sup.rawSubstring(startInSup, endInSup).equals(content);
  }

  /**
   * ***************************************************************************** Private
   * implementation
   */

  /** The characters of this string. */
  private final String content;
  /** Offsets of the first and the last character of {@link #content}. */
  private final OffsetGroupRange bounds;
  /**
   * Offset entries for {@link #content}.  The element type is spelled out so that entries can be
   * read back as {@link OffsetEntry} without casts.
   */
  private final List<OffsetEntry> offsets;

  /**
   * Creates a located string from its three parts.
   *
   * @param content the characters of the string; must not be null
   * @param offsets the offset entries for {@code content}; must not be null
   * @param bounds the offsets of the first and last character; must not be null
   * @throws NullPointerException if any argument is null
   */
  private LocatedString(final String content, final List<OffsetEntry> offsets,
      final OffsetGroupRange bounds) {
    this.content = checkNotNull(content);
    this.bounds = checkNotNull(bounds);
    // The constructor is private, but the public forString(text, bounds, spanOffsets) factory
    // hands over a caller-owned list.  Copying keeps this object immutable; per the Guava
    // documentation copyOf avoids the copy when the argument is already an ImmutableList, which
    // is what the internal callers pass.
    this.offsets = ImmutableList.copyOf(offsets);
  }

  /**
   * Combines the content, the bounds and the offset entries, i.e. the same fields that
   * {@link #equals(Object)} compares.
   */
  @Override
  public int hashCode() {
    return Objects.hashCode(this.content, this.bounds, this.offsets);
  }

  /**
   * Equality for this is quite strict - it must be exactly the same string and offsets
   * with exactly the same interior material omitted, if any.
   */
  @Override
  public boolean equals(Object obj) {
    if (obj == this) {
      return true;
    }
    if (obj == null) {
      return false;
    }
    if (obj.getClass() != getClass()) {
      return false;
    }
    final LocatedString that = (LocatedString) obj;
    if (!Objects.equal(this.bounds, that.bounds)) {
      return false;
    }
    if (!Objects.equal(this.content, that.content)) {
      return false;
    }
    return Objects.equal(this.offsets, that.offsets);
  }

  /**
   * An immutable record tying a run of content positions to the offset groups of its first and
   * last character, together with a flag saying whether the run is a region skipped for EDT
   * purposes.  The enclosing class creates entries with {@code startPos} as the first position
   * of the run and {@code endPos} as the position just after it.
   */
  public static class OffsetEntry {

    private final int startPos;
    private final int endPos;
    private final OffsetGroup startOffset;
    private final OffsetGroup endOffset;
    private final boolean isEDTSkipRegion;

    /**
     * Stores the given values as they are; no validation is performed.
     */
    public OffsetEntry(final int startPos, final int endPos, final OffsetGroup startOffset,
        final OffsetGroup endOffset, final boolean isEDTSkipRegion) {
      this.startPos = startPos;
      this.endPos = endPos;
      this.startOffset = startOffset;
      this.endOffset = endOffset;
      this.isEDTSkipRegion = isEDTSkipRegion;
    }

    /** Returns the start position given at construction. */
    public int startPos() {
      return this.startPos;
    }

    /** Returns the end position given at construction. */
    public int endPos() {
      return this.endPos;
    }

    /** Returns the offset group given as the start offset at construction. */
    public OffsetGroup startOffset() {
      return this.startOffset;
    }

    /** Returns the offset group given as the end offset at construction. */
    public OffsetGroup endOffset() {
      return this.endOffset;
    }

    /** Returns the EDT-skip flag given at construction. */
    public boolean isEDTSkipRegion() {
      return this.isEDTSkipRegion;
    }

    /** Lists all five fields, one per line. */
    @Override
    public String toString() {
      final StringBuilder sb = new StringBuilder();
      sb.append("start: ").append(startPos);
      sb.append("\nend: ").append(endPos);
      sb.append("\nstartOffset: ").append(startOffset);
      sb.append("\nendOffset: ").append(endOffset);
      sb.append("\nEDTSkip: ").append(isEDTSkipRegion);
      return sb.toString();
    }

    @Override
    public int hashCode() {
      return Objects.hashCode(startPos, endPos, startOffset, endOffset, isEDTSkipRegion);
    }

    /** Two entries are equal when they are of the same class and all five fields are equal. */
    @Override
    public boolean equals(Object obj) {
      if (obj == this) {
        return true;
      }
      if (obj == null || obj.getClass() != getClass()) {
        return false;
      }
      final OffsetEntry that = (OffsetEntry) obj;
      // primitives are compared directly; only the offset groups need a null-safe comparison
      return this.startPos == that.startPos
          && this.endPos == that.endPos
          && Objects.equal(this.startOffset, that.startOffset)
          && Objects.equal(this.endOffset, that.endOffset)
          && this.isEDTSkipRegion == that.isEDTSkipRegion;
    }
  }

  /**
   * Builds the offset entries for {@code text} by walking it one {@code char} at a time.
   *
   * <p>The byte, character and EDT counters start at the values found in
   * {@code initialOffsets}; byte offsets are only recorded when {@code initialOffsets} carries
   * one.  Every {@code char} advances the character counter by one and the byte counter by
   * {@code UTF8BytesInChar(c)}.  The EDT counter advances on every {@code char} when
   * {@code EDTOffsetsAreCharOffsets} is true; otherwise it does not advance for {@code '<'},
   * for {@code '\r'}, or while inside a {@code '<'...'>'} span.
   *
   * <p>When {@code EDTOffsetsAreCharOffsets} is false, the current entry is closed and a new one
   * started (never at position 0) in front of a {@code '<'} seen outside a tag, in front of
   * the character following a {@code '\r'} seen outside a tag, and in front of the character
   * following the {@code '>'} that closed the outermost tag - unless that character is itself
   * a {@code '<'}, in which case adjacent tags share one entry.  An entry closed right after
   * leaving a tag is flagged as an EDT skip region.  Whatever remains at the end of the text
   * becomes the last entry.
   *
   * @param text the text to index; must not be null
   * @param initialOffsets the offsets of the first character; must not be null
   * @param EDTOffsetsAreCharOffsets whether EDT offsets simply follow character offsets
   * @return the entries in content order; empty when {@code text} is empty
   */
  private static List<OffsetEntry> calculateOffsets(final String text,
      final OffsetGroup initialOffsets, final boolean EDTOffsetsAreCharOffsets) {
    checkNotNull(text);
    checkNotNull(initialOffsets);

    final ImmutableList.Builder<OffsetEntry> offsets = ImmutableList.builder();

    // depth of '<' seen so far that have not been closed by a '>'
    int inTag = 0;
    final boolean useByteOffsets = initialOffsets.byteOffset().isPresent();
    // the byte counter is never read when byte offsets are not in use
    int byteOffset = useByteOffsets ? initialOffsets.byteOffset().get().value() : Integer.MIN_VALUE;
    int charOffset = initialOffsets.charOffset().value();
    int edtOffset = initialOffsets.edtOffset().value();

    int pos = 0;
    // content position at which the entry currently being built starts
    int startPos = 0;
    boolean justLeftXMLTag = false;
    char prevChar = 0;
    // offsets of the first character of the entry currently being built
    OffsetGroup start = initialOffsets;

    // TODO: figure out how this works with UTF-16 unicode encoding...
    for (; pos < text.length(); ++pos) {
      final char c = text.charAt(pos);
      if (!EDTOffsetsAreCharOffsets && pos > 0 &&
          (inTag == 0 && (c == '<' || prevChar == '\r') || justLeftXMLTag)
          && !(justLeftXMLTag && c == '<')) {
        // the counters already point at the current character, so the entry being closed ends
        // one before them
        final int prevEDTOffset =
            (edtOffset == 0 || prevChar == '\r') ? edtOffset : (edtOffset - 1);
        offsets.add(
            new OffsetEntry(startPos, pos, start,
                OffsetGroup.from(useByteOffsets ? new ByteOffset(byteOffset - 1) : null,
                new CharOffset(charOffset - 1), EDTOffset.asEDTOffset(prevEDTOffset)), justLeftXMLTag));
        startPos = pos;
        final int startEDTOffset = (c == '<') ? edtOffset - 1 : edtOffset;
        start = OffsetGroup
            .from(useByteOffsets ? new ByteOffset(byteOffset) : null, new CharOffset(charOffset),
            EDTOffset.asEDTOffset(startEDTOffset));
      }

      ++charOffset;
      byteOffset += UTF8BytesInChar(c);
      if (EDTOffsetsAreCharOffsets || (!(inTag != 0 || c == '<' || c == '\r'))) {
        ++edtOffset;
      }
      if (!EDTOffsetsAreCharOffsets) {
        justLeftXMLTag = false;
        if (c == '<') {
          ++inTag;
        } else if (inTag > 0 && c == '>') {
          --inTag;
          if (inTag == 0) {
            justLeftXMLTag = true;
          }
        }
      }
      prevChar = c;
    }
    if (pos > startPos) {
      // close the entry that is still open when the text ends
      final int prevEDTOffset = Math.max(start.edtOffset().value(), edtOffset - 1);
      offsets.add(new OffsetEntry(startPos, pos, start,
          OffsetGroup.from(useByteOffsets ? new ByteOffset(byteOffset - 1) : null,
              new CharOffset(charOffset - 1),
              EDTOffset.asEDTOffset(prevEDTOffset)), inTag > 0 || justLeftXMLTag));
    }
    return offsets.build();
  }

  /**
   * Derives the bounds of a string from its offset entries: the start offset of the first
   * entry paired with the end offset of the last one.
   *
   * @param offsets the entries of the string, in content order; must not be empty
   * @return the range from the first entry's start offset to the last entry's end offset
   * @throws IllegalArgumentException if {@code offsets} is empty
   */
  private static OffsetGroupRange boundsFromOffsets(final List<OffsetEntry> offsets) {
    checkArgument(!offsets.isEmpty(),
        "Cannot derive bounds from an empty list of offset entries");
    final OffsetEntry first = offsets.get(0);
    final OffsetEntry last = offsets.get(offsets.size() - 1);
    return OffsetGroupRange.from(first.startOffset, last.endOffset);
  }

  // Upper limits of the value ranges that UTF-8 encodes in one, two and three bytes.
  private static final char ONE_BYTE = (char) 0x7f;
  private static final char TWO_BYTE = (char) 0x7ff;
  private static final char THREE_BYTE = (char) 0xffff;

  /**
   * Returns how many bytes the given UTF-16 code unit contributes to the UTF-8 encoding of the
   * text it belongs to.
   *
   * <p>A {@code char} can never exceed {@code 0xffff}, so a single code unit never needs four
   * bytes on its own.  Four-byte sequences arise only for supplementary code points, which Java
   * stores as a pair of surrogate code units.  Each surrogate is therefore counted as two
   * bytes, so that a well-formed pair adds up to the four bytes UTF-8 uses for the code point.
   *
   * @param c a UTF-16 code unit
   * @return 1, 2 or 3 for ordinary characters; 2 for either half of a surrogate pair
   */
  private static final int UTF8BytesInChar(final char c) {
    if (c <= ONE_BYTE) {
      return 1;
    } else if (c <= TWO_BYTE) {
      return 2;
    } else if (Character.isHighSurrogate(c) || Character.isLowSurrogate(c)) {
      // half of a supplementary code point, whose UTF-8 encoding takes four bytes in total
      return 2;
    } else {
      return 3;
    }
  }

  /**
   * Returns offsets corresponding to substring, in order.
   *
   * <p>Entries that lie completely inside the requested span are carried over with their
   * positions shifted so that the substring starts at position 0.  An entry that sticks out at
   * either end is trimmed, and the offset group at the trimmed end is rebuilt from a character
   * offset and an EDT offset only.
   *
   * <p>The rebuilt character offset is measured from the entry's own offsets.  A content
   * position is not a character offset unless this string happens to start at character
   * offset 0, which is not the case for, e.g., a substring of a substring.
   *
   * @param startIndexInclusive content position of the first character of the substring
   * @param endIndexExclusive content position one past the last character of the substring
   * @return the entries of the substring, with positions relative to the substring
   * @throws IllegalArgumentException if the span is empty or reversed, or if an entry that has
   *     to be trimmed has character and EDT lengths that differ while the EDT length is not 0
   */
  private List<OffsetEntry> offsetsOfSubstring(final int startIndexInclusive,
      final int endIndexExclusive) {
    // Guava only fills in the %s placeholders when the check fails
    checkArgument(startIndexInclusive < endIndexExclusive,
        "Start Index %s not less than end index %s", startIndexInclusive, endIndexExclusive);

    final ImmutableList.Builder<OffsetEntry> ret = ImmutableList.builder();

    for (int entryNum = lastEntryStartingBefore(startIndexInclusive); entryNum < offsets.size();
        ++entryNum) {
      final OffsetEntry entry = offsets.get(entryNum);
      checkArgument(entry.startPos <= endIndexExclusive);

      int newStartPos = entry.startPos;
      int newEndPos = entry.endPos;
      OffsetGroup newStartOffset = entry.startOffset;
      OffsetGroup newEndOffset = entry.endOffset;

      final int charLength =
          entry.endOffset.charOffset().value() - entry.startOffset.charOffset().value();
      final int edtLength =
          entry.endOffset.edtOffset().value() - entry.startOffset.edtOffset().value();

      /* DK: Recalculation of EDT offset assumes that within this OffsetEntry, there is no longer
       * any difference between edt chars and actual chars.  The checkArgument call makes this
       * assumption explicit, allowing the calculation of a new, correct edt offset.
       *
       * A special case is when the EDT offset length of this entry is 0, for example, if this
       * entry is a URL in angle brackets.  In this case, the edt offsets remain the same.
       */

      if (entry.startPos < startIndexInclusive) {
        // the entry begins before the substring: cut off its head
        final int trimmedFromStart = startIndexInclusive - entry.startPos;
        newStartPos = startIndexInclusive;

        checkArgument(charLength == edtLength || edtLength == 0);
        int newEDTOffsetValue = entry.startOffset.edtOffset().value();
        if (edtLength != 0) {
          newEDTOffsetValue += trimmedFromStart;
        }
        final int newCharOffsetValue =
            entry.startOffset.charOffset().value() + trimmedFromStart;
        newStartOffset =
            OffsetGroup.from(new CharOffset(newCharOffsetValue), new EDTOffset(newEDTOffsetValue));
      }
      if (entry.endPos > endIndexExclusive) {
        // the entry runs past the substring: cut off its tail
        final int trimmedFromEnd = entry.endPos - endIndexExclusive;
        newEndPos = endIndexExclusive;

        checkArgument(charLength == edtLength || edtLength == 0);
        int newEDTOffsetValue = entry.endOffset.edtOffset().value();
        if (edtLength != 0) {
          newEDTOffsetValue -= trimmedFromEnd;
        }
        final int newCharOffsetValue = entry.endOffset.charOffset().value() - trimmedFromEnd;
        newEndOffset = OffsetGroup
            .from(new CharOffset(newCharOffsetValue), new EDTOffset(newEDTOffsetValue));
      }
      // positions in the result are relative to the start of the substring
      newStartPos -= startIndexInclusive;
      newEndPos -= startIndexInclusive;
      ret.add(new OffsetEntry(newStartPos, newEndPos, newStartOffset, newEndOffset,
          entry.isEDTSkipRegion));

      // stop once the entry just added reaches the end of the substring
      if (newEndPos >= (endIndexExclusive - startIndexInclusive)) {
        break;
      }
    }

    return ret.build();
  }

  /**
   * Returns the index of the entry in front of the first entry, looking from index 1 onwards,
   * whose start position lies after {@code pos}.  If no such entry exists, the index of the last
   * entry is returned.  Entry 0 is never examined, so the result is 0 even when {@code pos}
   * lies in front of it.
   *
   * NOTE(review): presumably the entries are ordered by start position, which would make this
   * the last entry starting at or before {@code pos} - confirm.
   */
  private int lastEntryStartingBefore(final int pos) {
    int candidate = 0;
    for (int next = 1; next < offsets.size(); ++next) {
      if (offsets.get(next).startPos > pos) {
        break;
      }
      candidate = next;
    }
    return candidate;
  }

  /**
   * Renders the bounds and the content; the individual offset entries are left out.
   */
  @Override
  public String toString() {
    return Objects.toStringHelper(this)
        .add("bounds", this.bounds)
        .add("content", this.content)
        .toString();
  }


}