All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.analysis.AutomatonToTokenStream Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.Transition;

/** Converts an Automaton into a TokenStream. */
public class AutomatonToTokenStream {

  private AutomatonToTokenStream() {}

  /**
   * converts an automaton into a TokenStream. This is done by first Topo sorting the nodes in the
   * Automaton. Nodes that have the same distance from the start are grouped together to form the
   * position nodes for the TokenStream. The resulting TokenStream releases edges from the automaton
   * as tokens in order from the position nodes. This requires the automaton be a finite DAG.
   *
   * @param automaton automaton to convert. Must be a finite DAG.
   * @return TokenStream representation of automaton.
   */
  public static TokenStream toTokenStream(Automaton automaton) {
    if (Operations.isFinite(automaton) == false) {
      throw new IllegalArgumentException("Automaton must be finite");
    }

    List> positionNodes = new ArrayList<>();

    Transition[][] transitions = automaton.getSortedTransitions();

    int[] indegree = new int[transitions.length];

    for (int i = 0; i < transitions.length; i++) {
      for (int edge = 0; edge < transitions[i].length; edge++) {
        indegree[transitions[i][edge].dest] += 1;
      }
    }
    if (indegree[0] != 0) {
      throw new IllegalArgumentException("Start node has incoming edges, creating cycle");
    }

    LinkedList noIncomingEdges = new LinkedList<>();
    Map idToPos = new HashMap<>();
    noIncomingEdges.addLast(new RemapNode(0, 0));
    while (noIncomingEdges.isEmpty() == false) {
      RemapNode currState = noIncomingEdges.removeFirst();
      for (int i = 0; i < transitions[currState.id].length; i++) {
        indegree[transitions[currState.id][i].dest] -= 1;
        if (indegree[transitions[currState.id][i].dest] == 0) {
          noIncomingEdges.addLast(
              new RemapNode(transitions[currState.id][i].dest, currState.pos + 1));
        }
      }
      if (positionNodes.size() == currState.pos) {
        List posIncs = new ArrayList<>();
        posIncs.add(currState.id);
        positionNodes.add(posIncs);
      } else {
        positionNodes.get(currState.pos).add(currState.id);
      }
      idToPos.put(currState.id, currState.pos);
    }

    for (int i = 0; i < indegree.length; i++) {
      if (indegree[i] != 0) {
        throw new IllegalArgumentException("Cycle found in automaton");
      }
    }

    List> edgesByLayer = new ArrayList<>();
    for (List layer : positionNodes) {
      List edges = new ArrayList<>();
      for (int state : layer) {
        for (Transition t : transitions[state]) {
          // each edge in the token stream can only be on value, though a transition takes a range.
          for (int val = t.min; val <= t.max; val++) {
            int destLayer = idToPos.get(t.dest);
            edges.add(new EdgeToken(destLayer, val));
            // If there's an intermediate accept state, add an edge to the terminal state.
            if (automaton.isAccept(t.dest) && destLayer != positionNodes.size() - 1) {
              edges.add(new EdgeToken(positionNodes.size() - 1, val));
            }
          }
        }
      }
      edgesByLayer.add(edges);
    }

    return new TopoTokenStream(edgesByLayer);
  }

  /** Token Stream that outputs tokens from a topo sorted graph. */
  private static class TopoTokenStream extends TokenStream {

    private final List> edgesByPos;
    private int currentPos;
    private int currentEdgeIndex;
    private CharTermAttribute charAttr = addAttribute(CharTermAttribute.class);
    private PositionIncrementAttribute incAttr = addAttribute(PositionIncrementAttribute.class);
    private PositionLengthAttribute lenAttr = addAttribute(PositionLengthAttribute.class);
    private OffsetAttribute offAttr = addAttribute(OffsetAttribute.class);

    public TopoTokenStream(List> edgesByPos) {
      this.edgesByPos = edgesByPos;
    }

    @Override
    public boolean incrementToken() throws IOException {
      clearAttributes();
      while (currentPos < edgesByPos.size()
          && currentEdgeIndex == edgesByPos.get(currentPos).size()) {
        currentEdgeIndex = 0;
        currentPos += 1;
      }
      if (currentPos == edgesByPos.size()) {
        return false;
      }
      EdgeToken currentEdge = edgesByPos.get(currentPos).get(currentEdgeIndex);

      charAttr.append((char) currentEdge.value);

      incAttr.setPositionIncrement(currentEdgeIndex == 0 ? 1 : 0);

      lenAttr.setPositionLength(currentEdge.destination - currentPos);

      offAttr.setOffset(currentPos, currentEdge.destination);

      currentEdgeIndex++;

      return true;
    }

    @Override
    public void reset() throws IOException {
      super.reset();
      clearAttributes();
      currentPos = 0;
      currentEdgeIndex = 0;
    }

    @Override
    public void end() throws IOException {
      clearAttributes();
      incAttr.setPositionIncrement(0);
      // -1 because we don't count the terminal state as a position in the TokenStream
      offAttr.setOffset(edgesByPos.size() - 1, edgesByPos.size() - 1);
    }
  }

  /** Edge between position nodes. These edges will be output as tokens in the TokenStream */
  private static class EdgeToken {
    public final int destination;
    public final int value;

    public EdgeToken(int destination, int value) {
      this.destination = destination;
      this.value = value;
    }
  }

  /** Node that contains original node id and position in TokenStream */
  private static class RemapNode {
    public final int id;
    public final int pos;

    public RemapNode(int id, int pos) {
      this.id = id;
      this.pos = pos;
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy