All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.isi.nlp.StringWithNonBmp Maven / Gradle / Ivy

The newest version!
package edu.isi.nlp;

import com.google.common.base.Optional;
import edu.isi.nlp.strings.LocatedString;
import edu.isi.nlp.strings.LocatedString.CharacterRegion;
import edu.isi.nlp.strings.offsets.CharOffset;
import edu.isi.nlp.strings.offsets.OffsetRange;
import edu.isi.nlp.strings.offsets.UTF16Offset;
import org.immutables.func.Functional;
import org.immutables.value.Value;

/**
 * A {@link UnicodeFriendlyString} which actually contains a non-BMP character. This class should
 * never be referenced directly in user code because it assumes the provided string has a non-BMP
 * character. {@link UnicodeFriendlyString}s should always be created via {@link
 * StringUtils#unicodeFriendly(String)}.
 *
 * 

See the interface Javadoc for details. Currently this uses simple but slow implementations for * most of its method, but much faster versions a can be added when needed by using a structure * similar to {@link LocatedString}'s {@link CharacterRegion}. */ @IsiNlpImmutable @Value.Immutable @Functional abstract class StringWithNonBmp extends AbstractUnicodeFriendlyString implements UnicodeFriendlyString { @Override public abstract String utf16CodeUnits(); @Override public final boolean hasNonBmpCharacter() { return true; } @Override public boolean hasNonBmpCharacter(OffsetRange characterRange) { // slow placeholder implementation for (int i = codeUnitOffsetFor(characterRange.startInclusive()).asInt(); i <= codeUnitOffsetFor(characterRange.endInclusive()).asInt(); ) { final int codePoint = utf16CodeUnits().codePointAt(i); final int charsForCodePoint = Character.charCount(codePoint); if (charsForCodePoint > 1) { return true; } else { i += charsForCodePoint; } } return false; } @Value.Derived @Override public int lengthInUtf16CodeUnits() { return utf16CodeUnits().length(); } @Value.Derived @Override public int lengthInCodePoints() { return utf16CodeUnits().codePointCount(0, utf16CodeUnits().length()); } @Override public int codepointAtCodepointIndex(final CharOffset codepointIdx) { return utf16CodeUnits().codePointAt(codeUnitOffsetFor(codepointIdx).asInt()); } static UnicodeFriendlyString of(String s) { return new Builder().utf16CodeUnits(s).build(); } @Override public final UnicodeFriendlyString substringByCodePoints(CharOffset startCodepointInclusive) { return substringByCodePoints( startCodepointInclusive, CharOffset.asCharOffset(lengthInCodePoints())); } @Override public final UnicodeFriendlyString substringByCodePoints( CharOffset startCodepointInclusive, CharOffset endCodepointExclusive) { // this is a slow, simple, temporary implementation final UTF16Offset startCodeUnitInclusive = codeUnitOffsetFor(startCodepointInclusive); final UTF16Offset endCodeUnitExclusive = codeUnitOffsetFor(endCodepointExclusive); // we need this because in the current implementation we can't tell if our substring // includes no non-BMP characters and should therefore use the other implementation return StringUtils.unicodeFriendly( utf16CodeUnits().substring(startCodeUnitInclusive.asInt(), endCodeUnitExclusive.asInt())); } // this is a slow temporary implementation. If performance becomes important, we can store // something like LocatedString's like CharacterRegions private UTF16Offset codeUnitOffsetFor(final CharOffset codePointOffset) { int charOffset = 0; int codePointsConsumed = 0; for (; charOffset < utf16CodeUnits().length() && codePointsConsumed < codePointOffset.asInt(); ++codePointsConsumed) { final int codePoint = utf16CodeUnits().codePointAt(charOffset); charOffset += Character.charCount(codePoint); } if (codePointsConsumed == codePointOffset.asInt()) { return UTF16Offset.of(charOffset); } else { // this will happen if codePointOffset is negative or equal to or greater than the // total number of codepoints in the string throw new IndexOutOfBoundsException(); } } @Override public CharOffset codepointIndex(UTF16Offset offset) { // slow placeholder implementation if (offset.asInt() < 0 || offset.asInt() >= utf16CodeUnits().length()) { throw new IndexOutOfBoundsException( "Valid UTF-16 code unit indices for string are 0 to " + utf16CodeUnits().length() + " but got " + offset.asInt()); } int charOffset = 0; int nextCodeUnit = 0; if (offset.asInt() == 0) { return CharOffset.asCharOffset(0); } while (nextCodeUnit <= offset.asInt()) { nextCodeUnit += Character.charCount(utf16CodeUnits().codePointAt(nextCodeUnit)); // catch when we "hop" over the code unit due to a multi-codeunit character if (nextCodeUnit > offset.asInt()) { return CharOffset.asCharOffset(charOffset); } ++charOffset; } throw new IllegalStateException("Should be impossible"); } @Override public boolean isEmpty() { return utf16CodeUnits().isEmpty(); } @Override public final UnicodeFriendlyString trim() { return StringWithNonBmp.of(utf16CodeUnits().trim()); } @Override public final Optional codePointIndexOf( UnicodeFriendlyString other, CharOffset startIndex) { if (startIndex.asInt() < 0 || startIndex.asInt() > lengthInCodePoints()) { throw new IndexOutOfBoundsException("StartIndex was out of bounds: " + startIndex); } final UTF16Offset offsetForStart = codeUnitOffsetFor(startIndex); final int matchingOffset = utf16CodeUnits().indexOf(other.utf16CodeUnits(), offsetForStart.asInt()); if (matchingOffset < 0) { return Optional.absent(); } else { final UTF16Offset utf16Offset = UTF16Offset.of(matchingOffset); final CharOffset charOffset = codepointIndex(utf16Offset); return Optional.of(charOffset); } } @Override public final void processCodePoints(CodePointProcessor codePointProcessor) { for (int codeUnitOffset = 0, codePointOffset = 0; codeUnitOffset < utf16CodeUnits().length(); ++codePointOffset) { final int codePoint = utf16CodeUnits().codePointAt(codeUnitOffset); codePointProcessor.processCodepoint( this, CharOffset.asCharOffset(codePointOffset), codePoint); codeUnitOffset += Character.charCount(codePoint); } } @Override public final String toString() { return super.toString(); } @Override public final int hashCode() { return super.hashCode(); } @Override public final boolean equals(Object o) { return super.equals(o); } static class Builder extends ImmutableStringWithNonBmp.Builder {} }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy