All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.bbn.bue.common.strings.LocatedString Maven / Gradle / Ivy

There is a newer version: 4.1.2
Show newest version
package com.bbn.bue.common.strings;

import com.bbn.bue.common.strings.offsets.ASRTime;
import com.bbn.bue.common.strings.offsets.ByteOffset;
import com.bbn.bue.common.strings.offsets.CharOffset;
import com.bbn.bue.common.strings.offsets.EDTOffset;
import com.bbn.bue.common.strings.offsets.OffsetGroup;
import com.bbn.bue.common.strings.offsets.OffsetGroupRange;
import com.bbn.bue.common.strings.offsets.OffsetRange;

import com.google.common.base.Objects;
import com.google.common.base.Optional;
import com.google.common.collect.ImmutableList;

import java.util.List;
import java.util.NoSuchElementException;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Class for storing and manipulating strings that have been read in from a file, without losing
 * the relationship between each character and its origin in the file from which it was read.  In
 * particular, for each character in the located string, we record a start offset and an end offset
 * of each offset type (ByteOffset, CharOffset, EDTOffset, and ASRTime).  Start offsets and end
 * offsets are zero-indexed, and both are inclusive.  E.g., if a character in the string came from
 * a single byte at position 12, then that character's start ByteOffset and end ByteOffset will both
 * be 12.  For a character that was encoded using three bytes at positions 14, 15, and 16, the start
 * ByteOffset will be 14, and the end ByteOffset will be 16.
 *
 * In unmodified LocatedStrings, the start CharOffset for each character will be equal to its end
 * CharOffset.  However, modifications that replace substrings can result in individual characters
 * whose start and end offsets are not equal, since the offsets of the replacement characters are
 * set based on the entire range of characters in the replaced substring.
 *
 * The four offset types that are currently stored for each character are:
 *
 * - CharOffset.  More accurately, this is a unicode code point offset.
 *
 * - ByteOffset.  Currently, we assume that the source string was UTF-8, and calculate byte offsets
 * by checking how many bytes it would take to encode each character.  In the future, if we did our
 * own unicode encoding, we could directly read byte offsets for other encodings.
 *
 * - EDTOffset.  EDT offsets are similar to character offsets, except that (i) any substrings
 * starting with "<" and extending to the matching ">" are skipped when counting offsets; and (ii)
 * the character "\r" is skipped when counting offsets. Note that condition (i) is *not* always
 * identical to skipping XML/SGML tags and comments.
 *
 * - ASRTime.  The start and end time of the speech signal that corresponds to a character.  ASRTime
 * must be set after a LocatedString is constructed, using setAsrStartTime() and setAsrEndTime().
 *
 * @author originally by David A. Herman, refactored by Edward Loper; translated to Java by Ryan
 *         Gabbard
 * @author rgabbard
 */
public final class LocatedString {

  /**
   * Returns the character content of this located string as a plain {@code String}.
   *
   * @return the stored string content, without any offset information
   */
  public String text() {
    return this.content;
  }

  /**
   * Returns the length of the stored string content, as reported by {@link String#length()}.
   *
   * @return the number of {@code char} values in the content
   */
  public int length() {
    final String body = this.content;
    return body.length();
  }

  /*
   * ******************************************************************
   * Offset accessors
   * ******************************************************************
   */

  /**
   * Returns the EDT offset taken from the inclusive start of this string's bounds.
   *
   * @return the EDT offset of {@code bounds.startInclusive()}
   */
  public EDTOffset startEDTOffset() {
    final OffsetGroup firstOffsets = bounds.startInclusive();
    return firstOffsets.edtOffset();
  }

  /**
   * Returns the EDT offset taken from the inclusive end of this string's bounds.
   *
   * @return the EDT offset of {@code bounds.endInclusive()}
   */
  public EDTOffset endEDTOffset() {
    final OffsetGroup lastOffsets = bounds.endInclusive();
    return lastOffsets.edtOffset();
  }

  /**
   * Returns the character offset taken from the inclusive start of this string's bounds.
   *
   * @return the character offset of {@code bounds.startInclusive()}
   */
  public CharOffset startCharOffset() {
    final OffsetGroup firstOffsets = bounds.startInclusive();
    return firstOffsets.charOffset();
  }

  /**
   * Returns the character offset taken from the inclusive end of this string's bounds.
   *
   * @return the character offset of {@code bounds.endInclusive()}
   */
  public CharOffset endCharOffset() {
    final OffsetGroup lastOffsets = bounds.endInclusive();
    return lastOffsets.charOffset();
  }

  /**
   * Returns the offset range spanned by this string, from the start offsets of its first
   * character to the end offsets of its last character.
   *
   * @return the stored bounds of this located string
   */
  public OffsetGroupRange bounds() {
    return this.bounds;
  }

  /**
   * Returns the offset entries that map positions in {@link #text()} to source offsets.
   *
   * The list is returned as stored (no copy is made), so callers should treat it as read-only.
   *
   * @return the offset entries of this string, in order of position
   */
  public List<OffsetEntry> offsetEntries() {
    return offsets;
  }

  /**
   * Creates a {@code LocatedString} for {@code text} whose byte, character and EDT offsets all
   * start counting from zero.
   *
   * @param text the string content
   * @return a located string whose offsets are computed from {@code text}
   */
  public static LocatedString forString(final String text) {
    final ByteOffset firstByte = new ByteOffset(0);
    final CharOffset firstChar = new CharOffset(0);
    final EDTOffset firstEdt = EDTOffset.asEDTOffset(0);
    return forString(text, OffsetGroup.from(firstByte, firstChar, firstEdt));
  }

  /**
   * Creates a {@code LocatedString} from content whose bounds and offset entries have already
   * been computed by the caller.
   *
   * The supplied entries are copied into an immutable list, so later changes to
   * {@code spanOffsets} cannot alter the returned object.
   *
   * @param text the string content
   * @param bounds the offset range covered by {@code text}
   * @param spanOffsets the offset entries describing {@code text}, in order of position
   * @return a located string holding the supplied content, bounds and entries
   * @throws NullPointerException if {@code spanOffsets} or any of its elements is null
   */
  public static LocatedString forString(final String text, final OffsetGroupRange bounds,
      final List<OffsetEntry> spanOffsets) {
    // The private constructor stores the list it is given without copying; snapshot the
    // caller's list here so this public entry point cannot leak mutable state into the object.
    return new LocatedString(text, ImmutableList.copyOf(spanOffsets), bounds);
  }

  /**
   * Creates a {@code LocatedString} for {@code text} whose offsets start at
   * {@code initialOffsets}, with EDT offsets computed separately from character offsets.
   *
   * @param text the string content
   * @param initialOffsets the offsets of the first character of {@code text}
   * @return a located string whose offsets are computed from {@code text}
   */
  public static LocatedString forString(final String text, final OffsetGroup initialOffsets) {
    final boolean edtSameAsChar = false;
    return forString(text, initialOffsets, edtSameAsChar);
  }

  /**
   * Creates a {@code LocatedString} for {@code text} whose offsets start at
   * {@code initialOffsets}.
   *
   * @param text the string content
   * @param initialOffsets the offsets of the first character of {@code text}
   * @param EDTOffsetsAreCharOffsets passed through to the offset calculation; when true, EDT
   *                                 offsets advance on every character just like character
   *                                 offsets
   * @return a located string whose offset entries and bounds are computed from {@code text}
   */
  public static LocatedString forString(final String text, final OffsetGroup initialOffsets,
      final boolean EDTOffsetsAreCharOffsets) {
    final List<OffsetEntry> offsets =
        calculateOffsets(text, initialOffsets, EDTOffsetsAreCharOffsets);
    final OffsetGroupRange bounds = boundsFromOffsets(offsets);
    return new LocatedString(text, offsets, bounds);
  }

  /**
   * Returns the part of this string lying between two offset groups as a new
   * {@code LocatedString}.  Only the character offsets of the two groups are consulted.
   *
   * NOTE: Because it recomputes the various offsets of every character in the
   * substring, this method is *significantly* more expensive than just
   * fetching the String content of the substring.  If you just need the String
   * content, you should use rawSubstring() instead.
   *
   * @param start offsets of the first character to include
   * @param end offsets of the last character to include
   * @return the located substring
   */
  public LocatedString substring(final OffsetGroup start, final OffsetGroup end) {
    final CharOffset firstChar = start.charOffset();
    final CharOffset lastChar = end.charOffset();
    return substring(firstChar, lastChar);
  }

  /**
   * Returns the part of this string lying between two character offsets (both inclusive) as a
   * new {@code LocatedString}.  The offsets are converted to string indices by subtracting the
   * character offset at the start of this string's bounds.
   *
   * NOTE: Because it recomputes the various offsets of every character in the
   * substring, this method is *significantly* more expensive than just
   * fetching the String content of the substring.  If you just need the String
   * content, you should use rawSubstring() instead.
   *
   * @param start character offset of the first character to include
   * @param end character offset of the last character to include
   * @return the located substring
   */
  public LocatedString substring(final CharOffset start, final CharOffset end) {
    final int origin = bounds.startInclusive().charOffset().value();
    final int firstIndexInclusive = start.value() - origin;
    // +1 turns the inclusive end offset into an exclusive string index
    final int lastIndexExclusive = (end.value() - origin) + 1;

    return substring(firstIndexInclusive, lastIndexExclusive);
  }

	/*public LocatedString substringConvertP(OffsetGroup start, OffsetGroup end) {
                final int startOffset = start.charOffset().value() - bounds.start.charOffset().value();
		final int endOffset = end.charOffset().value() - bounds.start.charOffset().value() + 1;

		return substringConvertP(startOffset, endOffset);
	}

	public LocatedString substringConvertP(int startIndexInclusive, int endIndexExclusive) {
		final String text = content.substring(startIndexInclusive, endIndexExclusive);
		final String text2 = text.replace("

", ""); final List offsets = offsetsOfSubstring(startIndexInclusive, endIndexExclusive); final OffsetRange bounds = boundsFromOffsets(offsets); System.out.println(new LocatedString(text2, offsets, bounds)); return new LocatedString(text2, offsets, bounds); }*/ /** * Return a LocatedString substring of this string. * * NOTE: Because it recomputes the various offsets of every character in the * substring, this method is *significantly* more expensive than just * fetching the String content of the substring. If you just need the String * content, you should use rawSubstring() instead. */ public LocatedString substring(final int startIndexInclusive, final int endIndexExclusive) { final String text = content.substring(startIndexInclusive, endIndexExclusive); final List offsets = offsetsOfSubstring(startIndexInclusive, endIndexExclusive); final OffsetGroupRange bounds = boundsFromOffsets(offsets); return new LocatedString(text, offsets, bounds); } /** * Return a String substring of this string. * * @param start * @param end * @return */ public String rawSubstring(final OffsetGroup start, final OffsetGroup end) { return rawSubstring(start.charOffset(), end.charOffset()); } /** * Return a String substring of this string. * * @param start * @param end * @return */ public String rawSubstring(final CharOffset start, final CharOffset end) { final int startOffset = start.value() - bounds.startInclusive().charOffset().value(); final int endOffset = end.value() - bounds.startInclusive().charOffset().value() + 1; return rawSubstring(startOffset, endOffset); } /** * Return a String substring of this string. * * @param startIndexInclusive * @param endIndexExclusive * @return */ public String rawSubstring(final int startIndexInclusive, final int endIndexExclusive) { return content.substring(startIndexInclusive, endIndexExclusive); } /** * Returns the earliest offset group within this {@code LocatedString} whose character offset * matches the one supplied. 
If not such offset group exists, throws a {@link * NoSuchElementException}. */ public OffsetGroup offsetGroupForCharOffset(final CharOffset offset) { // if this ever slows us down significantly, we can binary search for (final OffsetEntry entry : offsets) { if (entry.startOffset.charOffset().value() <= offset.value() && entry.endOffset.charOffset().value() > offset.value()) { // we assume EDT offsets are continuous witihn entries final int offsetWithinEntry = offset.value() - entry.startOffset.charOffset().value(); return OffsetGroup .from(offset, new EDTOffset(entry.startOffset.edtOffset().value() + offsetWithinEntry)); } } throw new NoSuchElementException(); } public boolean contains(LocatedString other) { // TODO: we do it this way because the C++ is implemented this way, // so implementing isSubstringOf is an easy, less error-prone // translation. But .contains() is more idiomatic Java. return other.isSubstringOf(this); } /** * finds the position of the first offset entry of this object which has an identical char offset to oe * * preserves the CPP interface, more or less * @param charOffset * @return */ private int positionOfStartOffsetChar(final CharOffset charOffset) { for(final OffsetEntry it: offsetEntries()) { if(it.startOffset().charOffset().asInt() > charOffset.asInt()) { return -1; } if(charOffset.asInt() <= it.endOffset().charOffset().asInt()) { return it.startPos() + (charOffset.asInt() - it.startOffset().charOffset().asInt()); } } return -1; } private CharOffset getStartOffset(int pos) { final OffsetEntry oe = offsetEntries().get(lastEntryStartingBefore(pos)); checkArgument(pos >= oe.startPos() && pos <= oe.endPos() - 1); if(pos == oe.startPos()) { return oe.startOffset().charOffset(); } else { return CharOffset.asCharOffset(oe.startOffset().charOffset().asInt() + (pos - oe.startPos())); } } private CharOffset getEndOffset(int pos) { final OffsetEntry oe = offsetEntries().get(lastEntryStartingBefore(pos)); checkArgument(pos >= oe.startPos() && pos 
<= oe.endPos()); if(pos == oe.endPos() -1) { return oe.endOffset().charOffset(); } else { return CharOffset.asCharOffset(oe.startOffset().charOffset().asInt() + (pos - oe.startPos())); } } private boolean isSubstringOf(LocatedString sup) { final int superStringStartPos = sup.positionOfStartOffsetChar(offsetEntries().get(0).startOffset().charOffset()); if (superStringStartPos < 0) { return false; } if (superStringStartPos + length() > sup.length()) { return false; } final OffsetRange thisCharOffsets = this.bounds().asCharOffsetRange(); if (thisCharOffsets.startInclusive().asInt() != sup.getStartOffset(superStringStartPos).asInt()) { return false; } if (thisCharOffsets.endInclusive().asInt() != sup.getEndOffset(superStringStartPos + this.length()).asInt()-1) { return false; } //TODO: if this is slow, do a point by point comparison instead of substring if (!sup.content.substring(superStringStartPos, superStringStartPos + this.length()).equals( content)) { return false; } return true; } /** * ***************************************************************************** Private * implementation */ private final String content; private final OffsetGroupRange bounds; private final List offsets; private LocatedString(final String content, final List offsets, final OffsetGroupRange bounds) { this.content = content; this.bounds = bounds; // since this is a private constructor, no need to defensively copy to preserve immutability this.offsets = offsets; } @Override public int hashCode() { return Objects.hashCode(content, bounds, offsets); } @Override /** * Equality for this is quite strict - it must be exactly the same string and offsets * with exactly the same interior material omitted, if any. 
*/ public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null || getClass() != obj.getClass()) { return false; } final LocatedString other = (LocatedString) obj; return Objects.equal(this.bounds, other.bounds) && Objects.equal(this.content, other.content) && Objects.equal(this.offsets, other.offsets); } public static class OffsetEntry { private final int startPos; private final int endPos; private final OffsetGroup startOffset; private final OffsetGroup endOffset; private final boolean isEDTSkipRegion; public OffsetEntry(final int startPos, final int endPos, final OffsetGroup startOffset, final OffsetGroup endOffset, final boolean isEDTSkipRegion) { this.startPos = startPos; this.endPos = endPos; this.startOffset = startOffset; this.endOffset = endOffset; this.isEDTSkipRegion = isEDTSkipRegion; } public int startPos() { return startPos; } public int endPos() { return endPos; } public OffsetGroup startOffset() { return startOffset; } public OffsetGroup endOffset() { return endOffset; } public boolean isEDTSkipRegion() { return isEDTSkipRegion; } @Override public String toString() { return "start: " + startPos + "\nend: " + endPos + "\nstartOffset: " + startOffset + "\nendOffset: " + endOffset + "\nEDTSkip: " + isEDTSkipRegion; } @Override public int hashCode() { return Objects.hashCode(startPos, endPos, startOffset, endOffset, isEDTSkipRegion); } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null || getClass() != obj.getClass()) { return false; } final OffsetEntry other = (OffsetEntry) obj; return Objects.equal(this.startPos, other.startPos) && Objects .equal(this.endPos, other.endPos) && Objects.equal(this.startOffset, other.startOffset) && Objects.equal(this.endOffset, other.endOffset) && Objects .equal(this.isEDTSkipRegion, other.isEDTSkipRegion); } } private static List calculateOffsets(final String text, final OffsetGroup initialOffsets, final boolean EDTOffsetsAreCharOffsets) { 
checkNotNull(text); checkNotNull(initialOffsets); final ImmutableList.Builder offsets = ImmutableList.builder(); final Optional weDontKnowASRTime = Optional.absent(); int inTag = 0; boolean useByteOffsets = initialOffsets.byteOffset().isPresent(); int byteOffset = useByteOffsets ? initialOffsets.byteOffset().get().value() : Integer.MIN_VALUE; int charOffset = initialOffsets.charOffset().value(); int edtOffset = initialOffsets.edtOffset().value(); int pos = 0; int startPos = 0; boolean justLeftXMLTag = false; char prevChar = 0; OffsetGroup start = initialOffsets; // TODO: figure out how this works with UTF-16 unicode encoding... for (; pos < text.length(); ++pos) { final char c = text.charAt(pos); if (!EDTOffsetsAreCharOffsets && pos > 0 && (inTag == 0 && (c == '<' || prevChar == '\r') || justLeftXMLTag) && !(justLeftXMLTag && c == '<')) { final int prevEDTOffset = (edtOffset == 0 || prevChar == '\r') ? edtOffset : (edtOffset - 1); offsets.add( new OffsetEntry(startPos, pos, start, OffsetGroup.from(useByteOffsets ? new ByteOffset(byteOffset - 1) : null, new CharOffset(charOffset - 1), EDTOffset.asEDTOffset(prevEDTOffset)), justLeftXMLTag)); startPos = pos; final int startEDTOffset = (c == '<') ? edtOffset - 1 : edtOffset; start = OffsetGroup .from(useByteOffsets ? new ByteOffset(byteOffset) : null, new CharOffset(charOffset), EDTOffset.asEDTOffset(startEDTOffset)); } ++charOffset; byteOffset += UTF8BytesInChar(c); if (EDTOffsetsAreCharOffsets || (!(inTag != 0 || c == '<' || c == '\r'))) { ++edtOffset; } if (!EDTOffsetsAreCharOffsets) { justLeftXMLTag = false; if (c == '<') { ++inTag; } else if (inTag > 0 && c == '>') { --inTag; if (inTag == 0) { justLeftXMLTag = true; } } } prevChar = c; } if (pos > startPos) { final int prevEDTOffset = Math.max(start.edtOffset().value(), edtOffset - 1); offsets.add(new OffsetEntry(startPos, pos, start, OffsetGroup.from(useByteOffsets ? 
new ByteOffset(byteOffset - 1) : null, new CharOffset(charOffset - 1), EDTOffset.asEDTOffset(prevEDTOffset)), inTag > 0 || justLeftXMLTag)); } return offsets.build(); } private static OffsetGroupRange boundsFromOffsets(final List offsets) { checkArgument(!offsets.isEmpty()); return OffsetGroupRange .from(offsets.get(0).startOffset, offsets.get(offsets.size() - 1).endOffset); } private static final char ONE_BYTE = 0x007f; private static final char TWO_BYTE = 0x07ff; private static final char THREE_BYTE = 0xffff; private static final int UTF8BytesInChar(final char c) { if (c <= ONE_BYTE) { return 1; } else if (c <= TWO_BYTE) { return 2; } else if (c <= THREE_BYTE) { return 3; } else { return 4; } } /** * Returns offsets corresponding to substring, in order. */ private List offsetsOfSubstring(final int startIndexInclusive, final int endIndexExclusive) { checkArgument(startIndexInclusive < endIndexExclusive, String.format("Start Index %d not less than end index %s", startIndexInclusive, endIndexExclusive)); final ImmutableList.Builder ret = ImmutableList.builder(); for (int entryNum = lastEntryStartingBefore(startIndexInclusive); entryNum < offsets.size(); ++entryNum) { final OffsetEntry entry = offsets.get(entryNum); checkArgument(entry.startPos <= endIndexExclusive); int newStartPos = entry.startPos; int newEndPos = entry.endPos; OffsetGroup newStartOffset = entry.startOffset; OffsetGroup newEndOffset = entry.endOffset; final int charLength = entry.endOffset.charOffset().value() - entry.startOffset.charOffset().value(); final int edtLength = entry.endOffset.edtOffset().value() - entry.startOffset.edtOffset().value(); /* DK: Recalculation of EDT offset assumes that within this OffsetEntry, there is no longer any difference * between edt chars and actual chars. The checkArgument call makes this assumption explicit, allowing the * calculation of a new, correct edt offset. 
* * A special case is when the EDT offset length of this entry is 0, for example, if this entry is a URL in angle brackets. * In this case, the edt offsets remain the same. */ if (entry.startPos < startIndexInclusive) { newStartPos = startIndexInclusive; checkArgument(charLength == edtLength || edtLength == 0); int newEDTOffsetValue = entry.startOffset.edtOffset().value(); if (edtLength != 0) { newEDTOffsetValue += (startIndexInclusive - entry.startPos); } newStartOffset = OffsetGroup.from(new CharOffset(startIndexInclusive), new EDTOffset(newEDTOffsetValue)); } if (entry.endPos > endIndexExclusive) { newEndPos = endIndexExclusive; checkArgument(charLength == edtLength || edtLength == 0); int newEDTOffsetValue = entry.endOffset.edtOffset().value(); if (edtLength != 0) { newEDTOffsetValue -= (entry.endPos - endIndexExclusive); } newEndOffset = OffsetGroup .from(new CharOffset(endIndexExclusive - 1), new EDTOffset(newEDTOffsetValue)); } newStartPos -= startIndexInclusive; newEndPos -= startIndexInclusive; ret.add(new OffsetEntry(newStartPos, newEndPos, newStartOffset, newEndOffset, entry.isEDTSkipRegion)); if (newEndPos >= (endIndexExclusive - startIndexInclusive)) { break; } } return ret.build(); } private int lastEntryStartingBefore(final int pos) { int i = 1; while (i < offsets.size() && offsets.get(i).startPos <= pos) { ++i; } return i - 1; } @Override public String toString() { return Objects.toStringHelper(this).add("bounds", bounds).add("content", content).toString(); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy