All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.wikimedia.highlighter.cirrus.lucene.hit.AutomatonHitEnum Maven / Gradle / Ivy

The newest version!
package org.wikimedia.highlighter.cirrus.lucene.hit;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.RegExp;
import org.apache.lucene.util.automaton.Transition;
import org.wikimedia.highlighter.cirrus.lucene.automaton.AcceptReturningReverseRunAutomaton;
import org.wikimedia.highlighter.cirrus.lucene.automaton.OffsetReturningRunAutomaton;
import org.wikimedia.search.highlighter.cirrus.hit.AbstractHitEnum;
import org.wikimedia.search.highlighter.cirrus.hit.HitWeigher;
import org.wikimedia.search.highlighter.cirrus.hit.weight.ConstantHitWeigher;

/**
 * HitEnum implementation that slides a Lucene automaton across the source,
 * matching whatever matches. Does not support overlapping matches.
 */
public abstract class AutomatonHitEnum extends AbstractHitEnum {
    /**
     * Creates a {@link Factory} for the given regular expression.
     *
     * @param regex regular expression, handed unchanged to the Factory constructor
     * @param maxDeterminizedStates state limit, handed unchanged to the Factory constructor
     * @return a newly constructed Factory
     */
    public static Factory factory(String regex, int maxDeterminizedStates) {
        Factory created = new Factory(regex, maxDeterminizedStates);
        return created;
    }

    /**
     * Compiles a regular expression once and then builds an
     * {@link AutomatonHitEnum} for each source string handed to it.
     * <p>
     * When the compiled automaton looks like it starts with a wildcard (see
     * {@code hasLeadingWildcard}) a second, reversed automaton is prepared and
     * {@link #build(String, HitWeigher, HitWeigher)} returns a two pass enum;
     * otherwise the single pass {@code Forward} enum is returned.
     * <p>
     * NOTE(review): {@code startPositions} is kept between calls and handed
     * back to {@code reverse.run} on the next build. If that method recycles
     * the BitSet it is given, an enum returned by an earlier build would see
     * its start positions change underneath it, and the factory would not be
     * safe to share between threads - confirm against
     * AcceptReturningReverseRunAutomaton.
     */
    public static final class Factory {
        /** Automaton used by both enum flavours to locate the end of a match. */
        private final OffsetReturningRunAutomaton forward;
        /** Reversed automaton for the two pass strategy, or null when only the forward pass is used. */
        private final AcceptReturningReverseRunAutomaton reverse;
        /** Result of the most recent reverse pass; null until the first two pass build. */
        private BitSet startPositions;

        private Factory(String regexString, int maxDeterminizedStates) {
            Automaton compiled = new RegExp(regexString).toAutomaton(maxDeterminizedStates);
            forward = new OffsetReturningRunAutomaton(compiled, false);
            reverse = hasLeadingWildcard(compiled)
                    ? reverseOf(regexString, maxDeterminizedStates)
                    : null;
        }

        /**
         * Builds the automaton for the backwards pass: the regex followed by
         * {@code .*}, reversed and then determinized.
         */
        private static AcceptReturningReverseRunAutomaton reverseOf(String regexString,
                int maxDeterminizedStates) {
            String withTrailingWildcard = "(" + regexString + ").*";
            Automaton padded = new RegExp(withTrailingWildcard).toAutomaton(maxDeterminizedStates);
            Automaton flipped = Operations.reverse(padded);
            Automaton deterministic = Operations.determinize(flipped, maxDeterminizedStates);
            return new AcceptReturningReverseRunAutomaton(deterministic);
        }

        /**
         * Build the HitEnum so all hits have equal weight.
         *
         * @param source text to search
         * @return enum using {@link ConstantHitWeigher#ONE} for both weights
         */
        public AutomatonHitEnum build(String source) {
            HitWeigher constant = ConstantHitWeigher.ONE;
            return build(source, constant, constant);
        }

        /**
         * Build the HitEnum over {@code source} with explicit weighers.
         *
         * @param source text to search
         * @param queryWeigher supplies the query weight of every hit
         * @param corpusWeigher supplies the corpus weight of every hit
         * @return a two pass enum when a reverse automaton was prepared,
         *  otherwise a forward enum
         */
        public AutomatonHitEnum build(String source, HitWeigher queryWeigher,
                HitWeigher corpusWeigher) {
            if (reverse != null) {
                // Backwards pass first: it yields the start offsets the enum will walk.
                startPositions = reverse.run(source, startPositions);
                return new AutomatonHitEnum.TwoPass(forward, startPositions, source, queryWeigher, corpusWeigher);
            }
            return new AutomatonHitEnum.Forward(forward, source, queryWeigher, corpusWeigher);
        }
    }

    /** Automaton the subclasses run forward over {@link #source} to find where a match ends. */
    protected final OffsetReturningRunAutomaton runAutomaton;
    /** Text being searched. */
    protected final String source;
    /** Produces {@link #queryWeight} for each hit. */
    protected final HitWeigher queryWeigher;
    /** Produces {@link #corpusWeight} for each hit. */
    protected final HitWeigher corpusWeigher;
    /** Cached {@code source.length()}. */
    protected final int length;
    /** Start offset of the current hit. */
    protected int start;
    /** End offset of the current hit; the subclasses also resume searching from here. */
    protected int end;
    /** Query weight computed for the current hit. */
    protected float queryWeight;
    /** Corpus weight computed for the current hit. */
    protected float corpusWeight;
    /** Index of the current hit; -1 until the first hit, then incremented once per hit. */
    protected int position = -1;

    /**
     * Stores the collaborators and caches the length of the source.
     *
     * @param runAutomaton automaton used to find the end of each match
     * @param source text to search; its length is read here, so it must not be null
     * @param queryWeigher supplies the query weight of each hit
     * @param corpusWeigher supplies the corpus weight of each hit
     */
    public AutomatonHitEnum(OffsetReturningRunAutomaton runAutomaton, String source, HitWeigher queryWeigher, HitWeigher corpusWeigher) {
        // Text first, together with its cached length.
        this.source = source;
        this.length = source.length();

        // Then the automaton and the two weighers.
        this.runAutomaton = runAutomaton;
        this.corpusWeigher = corpusWeigher;
        this.queryWeigher = queryWeigher;
    }

    /**
     * @return index of the current hit: -1 before any hit has been found,
     *  then 0, 1, ... as the subclasses increment it for each hit
     */
    @Override
    public int position() {
        return position;
    }

    /**
     * @return offset in the source at which the current hit starts
     */
    @Override
    public int startOffset() {
        return start;
    }

    /**
     * @return offset in the source at which the current hit ends, as reported
     *  by the run automaton
     */
    @Override
    public int endOffset() {
        return end;
    }

    /**
     * @return weight the query weigher assigned to the current hit
     */
    @Override
    public float queryWeight() {
        return queryWeight;
    }

    /**
     * @return weight the corpus weigher assigned to the current hit
     */
    @Override
    public float corpusWeight() {
        return corpusWeight;
    }

    /**
     * @return always 0; this enum does not track which source a hit came from
     */
    @Override
    public int source() {
        // We punt here and hope someone will override this behavior
        // because we really can't trace the hit to a useful source.
        return 0;
    }

    /**
     * @return the string form of the run automaton; nothing about the source
     *  or the current hit is included
     */
    @Override
    public String toString() {
        return runAutomaton.toString();
    }

    /**
     * Single pass implementation: beginning where the previous hit ended, the
     * automaton is tried at one code point boundary after another until it
     * reports a match or the source runs out.
     * <p>
     * NOTE(review): if {@code runAutomaton.run} can return an offset equal to
     * {@code start} (a zero length match), every following call to
     * {@link #next()} would report that same empty hit again - confirm against
     * OffsetReturningRunAutomaton.
     */
    public static class Forward extends AutomatonHitEnum {
        public Forward(OffsetReturningRunAutomaton runAutomaton, String source,
                       HitWeigher queryWeigher, HitWeigher corpusWeigher) {
            super(runAutomaton, source, queryWeigher, corpusWeigher);
        }

        /**
         * Advances to the next hit.
         *
         * @return true when a hit was found and start, end, position and both
         *  weights describe it; false once the source is exhausted
         */
        @Override
        public boolean next() {
            // Resume at the end of the previous hit and step one code point
            // at a time for as long as the automaton reports no match.
            for (start = end; start < length; start += Character.charCount(source.codePointAt(start))) {
                end = runAutomaton.run(source, start, length);
                if (end < 0) {
                    continue;
                }
                // A match begins at start and finishes at end.
                position++;
                queryWeight = queryWeigher.weight(position, start, end);
                corpusWeight = corpusWeigher.weight(position, start, end);
                return true;
            }

            // Out of characters: pin end to length so later calls return immediately.
            end = length;
            return false;
        }

        @Override
        public String toString() {
            return runAutomaton.toString();
        }
    }

    /**
     * The forward algorithm, above, when presented with a regex like '.*foo' has
     * a very expensive failure case when provided a string that does not match the
     * regex (such as the tail of a document after the initial match). The forward
     * implementation requires n^2 state transitions to verify none of the possible
     * initial positions match.
     *
     * Avoid this by first performing a backwards pass marking all valid start positions
     * of the regex. The forward pass can then lookup the next valid start position and
     * return a match directly. In this way the source is only scanned once for each pass
     * at the cost of allocating a bitset.
     *
     * The backwards pass happens before construction; this class only receives
     * its result as {@code startPositions}.
     */
    static class TwoPass extends AutomatonHitEnum {
        /** Bit i is set when the backwards pass marked offset i as a valid match start. */
        private final BitSet startPositions;

        /**
         * @param forward automaton used to find the end of a match from a known start
         * @param startPositions start offsets marked by the backwards pass over {@code source}
         * @param source text to search
         * @param queryWeigher supplies the query weight of each hit
         * @param corpusWeigher supplies the corpus weight of each hit
         */
        TwoPass(OffsetReturningRunAutomaton forward, BitSet startPositions, String source,
                                HitWeigher queryWeigher, HitWeigher corpusWeigher) {
            super(forward, source, queryWeigher, corpusWeigher);
            this.startPositions = startPositions;
        }

        /**
         * Advances to the next hit.
         *
         * @return true when a hit was found, false when no marked start
         *  position remains at or after the end of the previous hit
         * @throws IllegalStateException if a marked start position does not
         *  produce a forward match, meaning {@code startPositions} does not
         *  belong to this source and automaton (or was modified after this
         *  enum was built)
         */
        @Override
        public boolean next() {
            if (end >= length) {
                return false;
            }
            // Start looking where the last hit stopped.
            start = startPositions.nextSetBit(end);
            if (start == DocIdSetIterator.NO_MORE_DOCS) {
                // No matches remain. set end to length so we never check again.
                end = length;
                return false;
            }

            // Found a match!
            // Run the forward pass to find the end of the match. The result is
            // held in a local so that end is never left negative on failure.
            int matchEnd = runAutomaton.run(source, start, length);
            if (matchEnd < 0) {
                throw new IllegalStateException("Unreachable: offset " + start
                        + " is marked as a match start but the forward automaton found no match there"
                        + " (source length " + length + ")");
            }
            end = matchEnd;
            position++;
            queryWeight = queryWeigher.weight(position, start, end);
            corpusWeight = corpusWeigher.weight(position, start, end);
            return true;
        }
    }

    /**
     * Heuristically decides whether the automaton begins with a wildcard like
     * construct such as {@code [a-z]*} or {@code [a-z]+}.
     * <p>
     * State 0 is inspected first; failing that, each distinct state that state
     * 0 has a transition to is inspected once.
     *
     * @param a automaton to inspect
     * @return true when state 0, or a state one transition away from it,
     *  passes {@link #isStateUnconstrainedWildcard(Automaton, int)}; false
     *  otherwise, including when the automaton has no states at all
     */
    static boolean hasLeadingWildcard(Automaton a) {
        int numStates = a.getNumStates();
        if (numStates == 0) {
            // There is no state 0 to inspect, and seen[0] below would index
            // a zero length array.
            return false;
        }
        // catches [a-z]*
        if (isStateUnconstrainedWildcard(a, 0)) {
            return true;
        }
        // catches [a-z]+
        Transition t = new Transition();
        int max = a.initTransition(0, t);
        boolean[] seen = new boolean[numStates];
        seen[0] = true; // 0 was checked above.
        for (int i = 0; i < max; i++) {
            a.getNextTransition(t);
            if (!seen[t.dest]) {
                if (isStateUnconstrainedWildcard(a, t.dest)) {
                    return true;
                }
                // Several transitions may share a destination; test it only once.
                seen[t.dest] = true;
            }
        }
        return false;
    }

    /**
     * Tests whether a state loops back to itself over a wide enough range of
     * code points to be treated as a wildcard.
     * <p>
     * For every transition leaving {@code state} whose destination is
     * {@code state} itself, {@code max - min} of the transition is added up.
     * Complete hack, but seems to catch .* and similar constructs.
     *
     * @param a Automaton to check
     * @param state State within the automaton to check
     * @return True when the summed {@code max - min} of the self loops is
     *  greater than 15
     */
    static boolean isStateUnconstrainedWildcard(Automaton a, int state) {
        Transition cursor = new Transition();
        int selfLoopSpan = 0;
        for (int remaining = a.initTransition(state, cursor); remaining > 0; remaining--) {
            a.getNextTransition(cursor);
            if (cursor.dest != state) {
                // Leads somewhere else; not part of the self loop.
                continue;
            }
            selfLoopSpan += cursor.max - cursor.min;
        }
        return selfLoopSpan > 15;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy