
org.apache.lucene.analysis.AutomatonToTokenStream Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of org.apache.servicemix.bundles.lucene
Show all versions of org.apache.servicemix.bundles.lucene
This OSGi bundle wraps ${pkgArtifactId} ${pkgVersion} jar file.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.internal.hppc.IntArrayList;
import org.apache.lucene.internal.hppc.IntCursor;
import org.apache.lucene.internal.hppc.IntIntHashMap;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Transition;
/** Converts an Automaton into a TokenStream. */
public class AutomatonToTokenStream {
private AutomatonToTokenStream() {}
/**
* converts an automaton into a TokenStream. This is done by first Topo sorting the nodes in the
* Automaton. Nodes that have the same distance from the start are grouped together to form the
* position nodes for the TokenStream. The resulting TokenStream releases edges from the automaton
* as tokens in order from the position nodes. This requires the automaton be a finite DAG.
*
* @param automaton automaton to convert. Must be a finite DAG to avoid infinite loops!
* @return TokenStream representation of automaton.
*/
public static TokenStream toTokenStream(Automaton automaton) {
List positionNodes = new ArrayList<>();
Transition[][] transitions = automaton.getSortedTransitions();
int[] indegree = new int[transitions.length];
for (int i = 0; i < transitions.length; i++) {
for (int edge = 0; edge < transitions[i].length; edge++) {
indegree[transitions[i][edge].dest] += 1;
}
}
if (indegree[0] != 0) {
throw new IllegalArgumentException("Start node has incoming edges, creating cycle");
}
LinkedList noIncomingEdges = new LinkedList<>();
IntIntHashMap idToPos = new IntIntHashMap();
noIncomingEdges.addLast(new RemapNode(0, 0));
while (noIncomingEdges.isEmpty() == false) {
RemapNode currState = noIncomingEdges.removeFirst();
for (int i = 0; i < transitions[currState.id].length; i++) {
indegree[transitions[currState.id][i].dest] -= 1;
if (indegree[transitions[currState.id][i].dest] == 0) {
noIncomingEdges.addLast(
new RemapNode(transitions[currState.id][i].dest, currState.pos + 1));
}
}
if (positionNodes.size() == currState.pos) {
IntArrayList posIncs = new IntArrayList();
posIncs.add(currState.id);
positionNodes.add(posIncs);
} else {
positionNodes.get(currState.pos).add(currState.id);
}
idToPos.put(currState.id, currState.pos);
}
for (int i = 0; i < indegree.length; i++) {
if (indegree[i] != 0) {
throw new IllegalArgumentException("Cycle found in automaton");
}
}
List> edgesByLayer = new ArrayList<>();
for (IntArrayList layer : positionNodes) {
List edges = new ArrayList<>();
for (IntCursor state : layer) {
for (Transition t : transitions[state.value]) {
// each edge in the token stream can only be on value, though a transition takes a range.
for (int val = t.min; val <= t.max; val++) {
int destLayer = idToPos.get(t.dest);
edges.add(new EdgeToken(destLayer, val));
// If there's an intermediate accept state, add an edge to the terminal state.
if (automaton.isAccept(t.dest) && destLayer != positionNodes.size() - 1) {
edges.add(new EdgeToken(positionNodes.size() - 1, val));
}
}
}
}
edgesByLayer.add(edges);
}
return new TopoTokenStream(edgesByLayer);
}
/** Token Stream that outputs tokens from a topo sorted graph. */
private static class TopoTokenStream extends TokenStream {
private final List> edgesByPos;
private int currentPos;
private int currentEdgeIndex;
private CharTermAttribute charAttr = addAttribute(CharTermAttribute.class);
private PositionIncrementAttribute incAttr = addAttribute(PositionIncrementAttribute.class);
private PositionLengthAttribute lenAttr = addAttribute(PositionLengthAttribute.class);
private OffsetAttribute offAttr = addAttribute(OffsetAttribute.class);
public TopoTokenStream(List> edgesByPos) {
this.edgesByPos = edgesByPos;
}
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
while (currentPos < edgesByPos.size()
&& currentEdgeIndex == edgesByPos.get(currentPos).size()) {
currentEdgeIndex = 0;
currentPos += 1;
}
if (currentPos == edgesByPos.size()) {
return false;
}
EdgeToken currentEdge = edgesByPos.get(currentPos).get(currentEdgeIndex);
charAttr.append((char) currentEdge.value);
incAttr.setPositionIncrement(currentEdgeIndex == 0 ? 1 : 0);
lenAttr.setPositionLength(currentEdge.destination - currentPos);
offAttr.setOffset(currentPos, currentEdge.destination);
currentEdgeIndex++;
return true;
}
@Override
public void reset() throws IOException {
super.reset();
clearAttributes();
currentPos = 0;
currentEdgeIndex = 0;
}
@Override
public void end() throws IOException {
clearAttributes();
incAttr.setPositionIncrement(0);
// -1 because we don't count the terminal state as a position in the TokenStream
offAttr.setOffset(edgesByPos.size() - 1, edgesByPos.size() - 1);
}
}
/** Edge between position nodes. These edges will be output as tokens in the TokenStream */
private record EdgeToken(int destination, int value) {}
/** Node that contains original node id and position in TokenStream */
private record RemapNode(int id, int pos) {}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy