All downloads are free. The search and download functionality uses the official Maven repository.

edu.isi.nlp.UnicodeFriendlyMatcher Maven / Gradle / Ivy

The newest version!
package edu.isi.nlp;

import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.base.Optional;
import edu.isi.nlp.strings.offsets.CharOffset;
import edu.isi.nlp.strings.offsets.OffsetRange;
import edu.isi.nlp.strings.offsets.UTF16Offset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/** Created by rgabbard on 7/7/17. */
public final class UnicodeFriendlyMatcher {

  private final Matcher wrappedMatcher;
  private final UnicodeFriendlyString matchedString;

  private UnicodeFriendlyMatcher(
      final Matcher wrappedMatcher, UnicodeFriendlyString matchedString) {
    this.wrappedMatcher = checkNotNull(wrappedMatcher);
    this.matchedString = checkNotNull(matchedString);
  }

  public static UnicodeFriendlyMatcher match(Pattern pattern, UnicodeFriendlyString ufs) {
    return new UnicodeFriendlyMatcher(pattern.matcher(ufs.utf16CodeUnits()), ufs);
  }

  public boolean find() {
    return wrappedMatcher.find();
  }

  public boolean matches() {
    return wrappedMatcher.matches();
  }

  public OffsetRange matchOffsetsInclusive() {
    return OffsetRange.fromInclusiveEndpoints(start(), endExclusive().shiftedCopy(-1));
  }

  public Optional> matchOffsetsInclusive(int group) {
    final boolean successfulMatchWithNoText =
        wrappedMatcher.start(group) == wrappedMatcher.end(group);

    if (successfulMatchWithNoText) {
      return Optional.absent();
    }

    final Optional startGroupCodepoint = start(group);
    final Optional endGroupCodepointExclusive = endExclusive(group);

    if (startGroupCodepoint.isPresent() && endGroupCodepointExclusive.isPresent()) {
      return Optional.of(
          OffsetRange.fromInclusiveEndpoints(
              startGroupCodepoint.get(), endGroupCodepointExclusive.get().shiftedCopy(-1)));
    } else {
      return Optional.absent();
    }
  }

  public CharOffset start() {
    return startMatchedCodepointForCodeUnit(wrappedMatcher.start());
  }

  public Optional start(int group) {
    final int startCodeUnit = wrappedMatcher.start(group);
    if (startCodeUnit >= 0) {
      return Optional.of(startMatchedCodepointForCodeUnit(startCodeUnit));
    } else {
      return Optional.absent();
    }
  }

  public CharOffset endExclusive() {
    return endMatchedCodepointForCodeUnit(wrappedMatcher.end());
  }

  public Optional endExclusive(int group) {
    final int endCodeUnit = wrappedMatcher.end(group);

    final boolean successfulMatchWithNoText = wrappedMatcher.start(group) == endCodeUnit;

    if (successfulMatchWithNoText) {
      // TODO: consider if this is the right thing to do here. Issue #70
      return Optional.absent();
    }

    if (endCodeUnit >= 0) {
      return Optional.of(endMatchedCodepointForCodeUnit(endCodeUnit));
    } else {
      return Optional.absent();
    }
  }

  public Optional group(int groupIndex) {
    final Optional> offsets = matchOffsetsInclusive(groupIndex);
    if (offsets.isPresent()) {
      return Optional.of(matchedString.substringByCodePoints(offsets.get()));
    } else {
      return Optional.absent();
    }
  }

  public int groupCount() {
    return wrappedMatcher.groupCount();
  }

  private CharOffset startMatchedCodepointForCodeUnit(int codeunit) {
    return matchedString.codepointIndex(UTF16Offset.of(codeunit));
  }

  private CharOffset endMatchedCodepointForCodeUnit(int codeunit) {
    // the inner shift is necessary because wrappedMatcher.end() returns the index *past* the
    // end of the match. Since this might be out-of-bounds for the string, we might be unable to
    // map it to a code point offset. So we instead map the preceding character and then shift
    // the resulting codepoint by one to return an exclusive end offset
    return matchedString.codepointIndex(UTF16Offset.of(codeunit - 1)).shiftedCopy(1);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy