com.cobber.fta.token.CharClassToken Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of fta-core Show documentation
Analyze Text data to determine simple type and Semantic type information as well as other key metrics associated with a text stream.
There is a newer version: 15.10.1
Show newest version
/*
 * Copyright 2017-2024 Tim Segall
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cobber.fta.token;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import com.cobber.fta.core.RegExpSplitter;

/**
 * A CharClassToken captures a Regular Expression Character Class.
 * The performance of this class, and of merge() in particular, is critical to the overall performance of FTA.
 * We split the tracking into ASCII and non-ASCII characters to improve performance: ASCII characters are
 * recorded in a fixed-size boolean table, everything else in a sorted set.
 */
public class CharClassToken extends Token {
	/** Size of the ASCII tracking table (characters 0-127). */
	private static final int ASCII_LIMIT = 128;
	/** Number of distinct ASCII characters in a full digit class: 0-9. */
	private static final int DIGIT_SET_SIZE = 10;
	/** Number of distinct ASCII characters in a full alpha class: a-zA-Z. */
	private static final int ALPHA_SET_SIZE = 52;
	/** Number of distinct ASCII characters in a full alphadigit class: 0-9a-zA-Z. */
	private static final int ALPHADIGIT_SET_SIZE = 62;

	/** Number of distinct ASCII characters observed. */
	private int countASCII;
	/** seenASCII[c] is true once the ASCII character c has been observed. */
	private final boolean[] seenASCII = new boolean[ASCII_LIMIT];
	/** Lowest ASCII character observed (Integer.MAX_VALUE while none has been seen). */
	private int lowASCII = Integer.MAX_VALUE;
	/** Highest ASCII character observed (Integer.MIN_VALUE while none has been seen). */
	private int highASCII = Integer.MIN_VALUE;
	/** Size of the complete ASCII set for this token's type, or -1 if the type has no fixed set. */
	private int maxSetASCII = -1;
	/** Number of distinct non-ASCII characters observed. */
	private int countNonASCII;
	/** Minimum repetition count, used to qualify the generated Regular Expression. */
	private int minObserved = 1;
	/** Maximum repetition count, used to qualify the generated Regular Expression. */
	private int maxObserved = 1;
	/** The distinct non-ASCII characters observed, kept sorted. */
	private Set<Character> seenNonASCII = new TreeSet<>();
	/** The individual tokens that were coalesced into this one, or null if none were. */
	private List<CharClassToken> children;

	/**
	 * Construct a Character Class token from its first observed character.
	 * @param type The type of the token (for example DIGIT_CLASS, ALPHA_CLASS or ALPHADIGIT_CLASS).
	 * @param ch The first character observed for this Character Class.
	 */
	public CharClassToken(final Token.Type type, final char ch) {
		super(type);

		if (ch < ASCII_LIMIT) {
			seenASCII[ch] = true;
			lowASCII = highASCII = ch;
			countASCII++;
		}
		else {
			seenNonASCII.add(ch);
			countNonASCII = 1;
		}

		// Record how many ASCII characters make up the full set so merging can stop once all have been seen
		if (type.equals(Token.Type.DIGIT_CLASS))
			maxSetASCII = DIGIT_SET_SIZE;
		else if (type.equals(Token.Type.ALPHA_CLASS))
			maxSetASCII = ALPHA_SET_SIZE;
		else if (type.equals(Token.Type.ALPHADIGIT_CLASS))
			maxSetASCII = ALPHADIGIT_SET_SIZE;
	}

	/**
	 * Construct an empty token of the given type - only used by newCopy() which fills in all the state.
	 * @param type The type of the token.
	 */
	private CharClassToken(final Token.Type type) {
		super(type);
	}

	/**
	 * Create a copy of this token with its own observation tables and counters.
	 * Note: the list of coalesced children is not carried over to the copy.
	 * @return A new CharClassToken with the same type, observations and min/max counts.
	 */
	@Override
	public CharClassToken newCopy() {
		final CharClassToken ret = new CharClassToken(this.type);

		ret.countASCII = this.countASCII;
		ret.lowASCII = this.lowASCII;
		ret.highASCII = this.highASCII;
		// Only the low..high window can contain observations, so that is all we need to copy
		if (ret.lowASCII != Integer.MAX_VALUE)
			System.arraycopy(seenASCII, this.lowASCII, ret.seenASCII, this.lowASCII, (this.highASCII - this.lowASCII) + 1);
		ret.countNonASCII = this.countNonASCII;
		ret.seenNonASCII = new TreeSet<>(this.seenNonASCII);
		ret.maxSetASCII = this.maxSetASCII;
		ret.minObserved = this.minObserved;
		ret.maxObserved = this.maxObserved;

		return ret;
	}

	/**
	 * @return The number of distinct characters (ASCII plus non-ASCII) observed by this token.
	 */
	@Override
	public int charactersUsed() {
		return countASCII + countNonASCII;
	}

	/**
	 * Build the sorted set of every character observed (both ASCII and non-ASCII).
	 * @return A new sorted set holding all observed characters.
	 */
	private Set<Character> getFullSet() {
		final Set<Character> ret = new TreeSet<>(seenNonASCII);

		if (countASCII != 0)
			for (int i = lowASCII; i <= highASCII; i++)
				if (seenASCII[i])
					ret.add((char)i);

		return ret;
	}

	/**
	 * Get the set of Ranges (contiguous low to high characters) for this Character Class.
	 * For example with inputs 1, 2, 3, 6, 8, 9 - three ranges would be returned 1-3, 6-6, and 8-9.
	 * @return A set of Ranges (low-high characters) that represent all the characters in this Character Class.
	 */
	public Set<Range> getRanges() {
		final Set<Range> ranges = new TreeSet<>();
		Range range = null;
		// Placeholder only - always overwritten by the first character before it is read
		char last = '¶';

		for (final char ch : getFullSet()) {
			if (range == null) {
				range = new Range(ch);
				last = ch;
			}
			else if (ch == last + 1)
				last = ch;			// Still contiguous - extend the current range
			else {
				// Gap detected - close off the current range and start a new one
				range.setMax(last);
				ranges.add(range);
				range = new Range(ch);
				last = ch;
			}
		}
		if (range != null) {
			range.setMax(last);
			ranges.add(range);
		}

		return ranges;
	}

	/**
	 * Get the unqualified Regular Expression (i.e. with no repetition count) for this token.
	 * @param enumerateRanges If true return an explicit character class built from the observed ranges,
	 *   otherwise return the generic Regular Expression for the token type.
	 * @return The single character if only one has been observed, otherwise as described for enumerateRanges.
	 */
	private String getSimpleRegExp(final boolean enumerateRanges) {
		final Set<Character> chars = getFullSet();
		if (chars.size() == 1)
			return String.valueOf(chars.iterator().next());
		if (!enumerateRanges)
			return type.getRegExp();

		final StringBuilder ret = new StringBuilder("[");
		for (final Range range : getRanges())
			ret.append(range.toString());
		ret.append(']');

		return ret.toString();
	}

	/**
	 * Get the Regular Expression for this token.
	 * @param fitted If false, return the generic Regular Expression for the type qualified by the observed
	 *   min/max counts. If true, build the expression from the coalesced children (or this token if there
	 *   are none), combining adjacent tokens that produce the same simple Regular Expression.
	 * @return The Regular Expression for this token.
	 */
	@Override
	public String getRegExp(final boolean fitted) {
		if (!fitted)
			return type.getRegExp() + RegExpSplitter.qualify(minObserved, maxObserved);

		final StringBuilder b = new StringBuilder();

		CharClassToken lastToken = null;
		final List<CharClassToken> kids = children == null ? Collections.singletonList(this) : children;
		boolean enumerateRanges = false;
		// Coalesce multiple numerics or alphas into one
		for (final CharClassToken token : kids) {
			// Only enumerate the ranges for digit classes where we have not seen the full set of digits.
			// NOTE(review): this flag is derived from the current token but is also applied to lastToken below - confirm intended.
			enumerateRanges = Token.Type.DIGIT_CLASS.equals(token.type) && token.countASCII != token.maxSetASCII;
			if (lastToken == null) {
				// Work on a copy so that coalescing does not modify the stored child
				lastToken = token.newCopy();
				continue;
			}

			if (token.getSimpleRegExp(enumerateRanges).equals(lastToken.getSimpleRegExp(enumerateRanges)))
				lastToken.coalesce(token);
			else {
				b.append(lastToken.getSimpleRegExp(enumerateRanges)).append(RegExpSplitter.qualify(lastToken.minObserved, lastToken.maxObserved));
				lastToken = token.newCopy();
			}
		}
		b.append(lastToken.getSimpleRegExp(enumerateRanges)).append(RegExpSplitter.qualify(lastToken.minObserved, lastToken.maxObserved));

		return b.toString();
	}

	/**
	 * Merge another token into this one, widening the observed characters and the min/max counts.
	 * Any previously coalesced children are discarded.
	 * @param o The other Token to be merged - must be a CharClassToken.
	 * @return This token, updated to reflect the merge.
	 * @throws ClassCastException if o is not a CharClassToken.
	 */
	@Override
	public Token merge(final Token o) {
		final CharClassToken other = (CharClassToken)o;

		children = null;

		mergeObservations(other);

		this.minObserved = Math.min(this.minObserved, other.minObserved);
		this.maxObserved = Math.max(this.maxObserved, other.maxObserved);

		return this;
	}

	/**
	 * Coalesce is used to merge two adjacent tokens into one in a single TokenStream, this is in contrast to merge which is
	 * used to merge two tokens in a similar position in different TokenStreams.
	 * @param other The other Token to be coalesced.
	 * @return The coalesced token.
	 */
	public CharClassToken coalesce(final CharClassToken other) {
		if (children == null) {
			// First coalesce - remember our own state (pre-merge) as the first child
			children = new ArrayList<>();
			children.add(this.newCopy());
		}
		children.add(other);

		mergeObservations(other);

		// Adjacent tokens are concatenated, so the lengths add rather than widen
		this.minObserved += other.minObserved;
		this.maxObserved += other.maxObserved;

		return this;
	}

	/**
	 * Fold the characters observed by the other token into this token, promoting the type to
	 * ALPHADIGIT_CLASS if the two types differ.
	 * @param other The token whose observations are to be added to this token.
	 */
	private void mergeObservations(final CharClassToken other) {
		// The only differing types we are prepared to merge is anything to ALPHADIGIT
		if (!type.equals(other.type) && !type.equals(Token.Type.ALPHADIGIT_CLASS)) {
			type = Token.Type.ALPHADIGIT_CLASS;
			maxSetASCII = ALPHADIGIT_SET_SIZE;
		}

		// No need to merge the ASCII observations if we have seen the entire set or no observations from other set
		if (maxSetASCII != countASCII && other.countASCII != 0) {
			for (int i = other.lowASCII; i <= other.highASCII; i++)
				if (other.seenASCII[i] && !seenASCII[i]) {
					seenASCII[i] = true;
					countASCII++;
				}
			lowASCII = Math.min(lowASCII, other.lowASCII);
			highASCII = Math.max(highASCII, other.highASCII);
		}

		if (other.countNonASCII != 0) {
			seenNonASCII.addAll(other.seenNonASCII);
			countNonASCII = seenNonASCII.size();
		}
	}
}