org.codelibs.opensearch.extension.analysis.NGramSynonymTokenizer Maven / Gradle / Ivy

Go to download
package org.codelibs.opensearch.extension.analysis;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.PriorityQueue;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;

// https://issues.apache.org/jira/browse/LUCENE-5252
public final class NGramSynonymTokenizer extends Tokenizer {

    /** Default gram size. */
    public static final int DEFAULT_N_SIZE = 2;

    /** Default block delimiters: space, full-width space, tab, line feed and carriage return. */
    public static final String DEFAULT_DELIMITERS = " 　\t\n\r";

    /** Capacity of the character buffer used when reading from the input. */
    static final int BUFFER_SIZE = 4096;

    /** Gram size used when splitting the non-synonym parts of a block. */
    private final int n;

    /** Characters that terminate a block. */
    private final String delimiters;

    /** When true the matched source word and every synonym are queued; otherwise only the first synonym is. */
    private final boolean expand;

    /** When true, input code points are lower-cased before the dictionary lookup. */
    private final boolean ignoreCase;

    /** Loader consulted in {@link #reset()} for an updated map; null when absent or not reloadable. */
    private final SynonymLoader synonymLoader;

    /** Timestamp handed to the loader when asking whether the map has been updated. */
    private long lastModified;

    /** Active synonym map, or null when no usable map is available. */
    private SynonymMap synonymMap = null;

    /** Arc reused while walking the FST. */
    private FST.Arc<BytesRef> scratchArc;

    /** FST of the active synonym map. */
    private FST<BytesRef> fst;

    /** Reader obtained from {@link #fst}. */
    private FST.BytesReader fstReader;

    /** Scratch buffer receiving the UTF-8 bytes of a synonym word. */
    private final BytesRef scratchBytes = new BytesRef();

    /** Scratch buffer receiving the UTF-16 form of a synonym word. */
    private final CharsRef scratchChars = new CharsRef();

    /** End offset (exclusive, relative to the block) of the last match found by getLongestMatchOutput. */
    private int longestMatchEndOffset;

    /** Last character read from the input, or -1 once the end of input was reached. */
    private int ch;

    /** Buffer filled from the input reader. */
    private final char[] readBuffer;

    /** Index of the next unread character in {@link #readBuffer}. */
    private int readBufferIndex;

    /** Number of valid characters in {@link #readBuffer}; -1 after the end of input. */
    private int readBufferLen;

    /** Text of the block currently being tokenized. */
    StringBuilder block;

    /** Offset of the current block within the input. */
    int blkStart;

    /** Number of characters consumed from the input so far. */
    int nextBlkStart;

    /** Corrected end offset of the most recently emitted token; reported by {@link #end()}. */
    private int finalOffset;

    /** Tokens of the current block waiting to be emitted, ordered by MyTokensComparator. */
    private final PriorityQueue<MyToken> queue;

    /** Most recently emitted token, used to suppress duplicates. */
    private MyToken prevToken;

    /** Dictionary matches found in the current block, in ascending offset order. */
    private final List<MyToken> synonyms;

    private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);

    private final OffsetAttribute offsetAttr = addAttribute(OffsetAttribute.class);

    private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class);

    protected NGramSynonymTokenizer(final int n, final String delimiters, final boolean expand, final boolean ignoreCase,
            final SynonymLoader synonymLoader) {
        this.n = n;
        this.delimiters = delimiters;
        this.expand = expand;
        this.ignoreCase = ignoreCase;
        if (synonymLoader != null) {
            if (synonymLoader.isReloadable()) {
                this.synonymLoader = synonymLoader;
                this.lastModified = synonymLoader.getLastModified();
            } else {
                this.synonymLoader = null;
                this.lastModified = System.currentTimeMillis();
            }
            synonymMap = synonymLoader.getSynonymMap();
            if (synonymMap != null && synonymMap.fst == null) {
                this.synonymMap = null;
            }
        } else {
            this.synonymLoader = null;
        }
        if (synonymMap != null) {
            this.fst = synonymMap.fst;
            this.fstReader = fst.getBytesReader();
            scratchArc = new FST.Arc<>();
        }

        ch = 0;
        readBuffer = new char[BUFFER_SIZE];
        readBufferIndex = BUFFER_SIZE;
        readBufferLen = 0;
        block = new StringBuilder();
        nextBlkStart = 0;
        queue = new PriorityQueue<>(100, new MyTokensComparator());
        this.synonyms = new ArrayList<>();
    }

    /**
     * Emits the next token. When the queue of the current block is exhausted, the next block
     * is read, matched against the dictionary and tokenized.
     *
     * @return false once no further block can be read from the input
     */
    @Override
    public boolean incrementToken() throws IOException {
        MyToken candidate = getNextUniqueToken(queue, prevToken);
        while (candidate == null) {
            getNextBlock();
            if (block.length() == 0) {
                return false;
            }
            consultDictionary();
            tokenizeWholeBlock();
            candidate = getNextUniqueToken(queue, prevToken);
        }

        prevToken = candidate;
        clearAttributes();
        termAttr.append(candidate.word);
        // Token offsets are relative to the block; shift them by the block start.
        finalOffset = correctOffset(blkStart + candidate.endOffset);
        final int correctedStart = correctOffset(blkStart + candidate.startOffset);
        offsetAttr.setOffset(correctedStart, finalOffset);
        posIncAttr.setPositionIncrement(candidate.posInc);
        return true;
    }

    /**
     * Polls the queue until a token is found that is not a duplicate of {@code prev}.
     *
     * @param que queue of pending tokens; skipped duplicates are removed from it
     * @param prev previously emitted token, or null when there is none
     * @return the next token to emit, or null when the queue is exhausted
     */
    static MyToken getNextUniqueToken(final PriorityQueue<MyToken> que, final MyToken prev) {
        MyToken token;
        while ((token = que.poll()) != null) {
            if (prev == null || !prev.identical(token)) {
                return token;
            }
        }
        return null;
    }

    /**
     * Scans the current block from left to right and records every longest dictionary
     * match in {@code synonyms}. Does nothing when no synonym map is available.
     */
    void consultDictionary() throws IOException {
        if (synonymMap == null) {
            return;
        }
        synonyms.clear();
        final char[] chars = block.toString().toCharArray();
        int pos = 0;
        while (pos < block.length()) {
            final BytesRef output = getLongestMatchOutput(chars, pos);
            if (output != null) {
                // TODO synonym
                synonyms.add(new MyToken(chars, pos, longestMatchEndOffset, 1, output.clone(), ignoreCase));
                // Resume right after the match so that matches never overlap.
                pos = longestMatchEndOffset;
            } else {
                pos++;
            }
        }
    }

    /**
     * Walks the FST over {@code src} beginning at {@code start} and returns the output of the
     * longest dictionary entry found there. On a match {@code longestMatchEndOffset} is set
     * to the exclusive end of that entry.
     *
     * @param src characters of the current block
     * @param start index in {@code src} at which matching begins
     * @return the output of the longest match, or null when nothing matches at {@code start}
     */
    BytesRef getLongestMatchOutput(final char[] src, final int start) throws IOException {
        fst.getFirstArc(scratchArc);
        BytesRef accumulated = fst.outputs.getNoOutput();
        BytesRef longest = null;

        int pos = start;
        while (pos < src.length) {
            final int codePoint = Character.codePointAt(src, pos, src.length);
            final int label = ignoreCase ? Character.toLowerCase(codePoint) : codePoint;
            if (fst.findTargetArc(label, scratchArc, scratchArc, fstReader) == null) {
                break;
            }

            accumulated = fst.outputs.add(accumulated, scratchArc.output());
            pos += Character.charCount(codePoint);

            if (scratchArc.isFinal()) {
                // Remember the match but keep going: a longer entry may still follow.
                longest = fst.outputs.add(accumulated, scratchArc.nextFinalOutput());
                longestMatchEndOffset = pos;
            }
        }

        return longest;
    }

    /**
     * Fills the queue with every token of the current block: n-grams for the stretches between
     * dictionary matches, the short grams adjoining each match, and the synonym tokens.
     */
    void tokenizeWholeBlock() {
        queue.clear();
        final ByteArrayDataInput outputReader = new ByteArrayDataInput();
        final int blockEnd = block.length();
        final int synonymCount = synonyms.size();
        int plainStart = 0;
        boolean gramsFollowSynonym = false;

        for (int idx = 0; idx < synonymCount; idx++) {
            final MyToken synonym = synonyms.get(idx);
            final int prevSynonymEnd = idx == 0 ? 0 : synonyms.get(idx - 1).endOffset;
            final int nextSynonymStart = idx + 1 < synonymCount ? synonyms.get(idx + 1).startOffset : blockEnd;

            // Text between the previous match (or the block start) and this match.
            tokenizePartialBlock(plainStart, synonym.startOffset, gramsFollowSynonym);

            // Short grams ending where this match begins.
            processPrevSynonym(synonym.startOffset, prevSynonymEnd);

            if (expand) {
                queue.add(synonym);
            }

            // Decode the FST output: a header vInt whose upper bits hold the entry count,
            // followed by one word ordinal per entry.
            outputReader.reset(synonym.output.bytes, synonym.output.offset, synonym.output.length);
            final int entryCount = outputReader.readVInt() >>> 1;
            for (int i = 0; i < entryCount; i++) {
                synonymMap.words.get(outputReader.readVInt(), scratchBytes);
                if (scratchChars.chars.length < scratchBytes.length) {
                    scratchChars.chars = new char[scratchBytes.length];
                }
                scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars);
                final String word = scratchChars.toString();

                if (expand) {
                    // The source word itself is already queued above.
                    if (!synonym.word.equals(word)) {
                        queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, 0, i + 1));
                    }
                } else {
                    // Without expansion only the first entry replaces the source word.
                    queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, 1, i + 1));
                    break;
                }
            }

            // Short grams starting where this match ends.
            gramsFollowSynonym = processAfterSynonym(synonym.endOffset, nextSynonymStart);

            plainStart = synonym.endOffset;
        }

        tokenizePartialBlock(plainStart, blockEnd, gramsFollowSynonym);
    }

    /**
     * Queues the n-grams of the block range {@code [startOffset, endOffset)}. A range shorter
     * than {@code n} is queued as a single token.
     *
     * @param startOffset inclusive start of the range within the block
     * @param endOffset exclusive end of the range within the block
     * @param afterSynonymProduced when true the first token gets a position increment of 0
     */
    void tokenizePartialBlock(final int startOffset, final int endOffset, final boolean afterSynonymProduced) {
        if (startOffset >= endOffset) {
            return;
        }

        final int firstPosInc = afterSynonymProduced ? 0 : 1;
        if (endOffset - startOffset < n) {
            queue.add(new MyToken(block.substring(startOffset, endOffset), startOffset, endOffset, firstPosInc));
            return;
        }

        final int lastGramStart = endOffset - n;
        for (int gramStart = startOffset; gramStart <= lastGramStart; gramStart++) {
            final int gramEnd = gramStart + n;
            final int posInc = gramStart == startOffset ? firstPosInc : 1;
            queue.add(new MyToken(block.substring(gramStart, gramEnd), gramStart, gramEnd, posInc));
        }
    }

    /**
     * Queues the grams of length 1 to {@code n - 1} that end at {@code endOffset}, each with a
     * position increment of 0, without reaching back past {@code limitOffset}.
     *
     * @param endOffset exclusive end of every queued gram
     * @param limitOffset smallest start offset a gram may have
     */
    void processPrevSynonym(final int endOffset, final int limitOffset) {
        for (int len = 1; len < n; len++) {
            final int startOffset = endOffset - len;
            if (startOffset < limitOffset) {
                break;
            }
            queue.add(new MyToken(block.substring(startOffset, endOffset), startOffset, endOffset, 0));
        }
    }

    /**
     * Queues the grams of length 1 to {@code n - 1} that start at {@code startOffset}, without
     * extending past {@code limitOffset}. The first gram gets a position increment of 1, the
     * others 0.
     *
     * @param startOffset inclusive start of every queued gram
     * @param limitOffset largest end offset a gram may have
     * @return true when at least one gram was queued
     */
    boolean processAfterSynonym(final int startOffset, final int limitOffset) {
        boolean produced = false;
        for (int len = 1; len < n; len++) {
            final int endOffset = startOffset + len;
            if (endOffset > limitOffset) {
                break;
            }
            final int posInc = len == 1 ? 1 : 0;
            queue.add(new MyToken(block.substring(startOffset, endOffset), startOffset, endOffset, posInc));
            produced = true;
        }
        return produced;
    }

    /**
     * Reports the end offset of the last emitted token as both the start and the end of the
     * final offset.
     */
    @Override
    public void end() throws IOException {
        super.end();
        final int offset = finalOffset;
        offsetAttr.setOffset(offset, offset);
    }

    /**
     * Prepares the tokenizer for a new input. All per-stream state is cleared, including
     * tokens left over from an input that was not consumed to the end, and the synonym map
     * is reloaded when the loader reports an update.
     *
     * @throws IllegalArgumentException if the reloaded synonym map has no FST; the previously
     *         active map is left in place in that case
     */
    @Override
    public void reset() throws IOException {
        super.reset();

        // Drop anything left over from the previous stream so it cannot leak into this one.
        block.setLength(0);
        queue.clear();
        synonyms.clear();
        prevToken = null;
        finalOffset = 0;
        longestMatchEndOffset = 0;

        // An index at BUFFER_SIZE with a length of zero forces a refill on the first read.
        readBufferIndex = BUFFER_SIZE;
        readBufferLen = 0;
        ch = 0;
        blkStart = 0;
        nextBlkStart = 0;

        if (synonymLoader != null && synonymLoader.isUpdate(lastModified)) {
            lastModified = synonymLoader.getLastModified();
            final SynonymMap map = synonymLoader.getSynonymMap();
            if (map != null) {
                // Validate before touching any field so a bad map cannot leave a null FST behind.
                if (map.fst == null) {
                    throw new IllegalArgumentException("fst must be non-null");
                }
                synonymMap = map;
                fst = map.fst;
                fstReader = fst.getBytesReader();
                scratchArc = new FST.Arc<>();
                clearAttributes();
            }
        }
    }

    /**
     * Reads the next run of non-delimiter characters into {@code block}. Leading delimiters
     * are skipped and advance {@code blkStart}; the delimiter that ends the block is consumed.
     *
     * @return true when a non-empty block was read, false at the end of input
     */
    boolean getNextBlock() throws IOException {
        blkStart = nextBlkStart;
        block.setLength(0);
        prevToken = null;

        // Once the end of input has been seen, ch stays at -1 and nothing more is read.
        while (ch != -1) {
            ch = readCharFromBuffer();
            if (ch == -1) {
                break;
            }
            if (isDelimiter(ch)) {
                if (block.length() > 0) {
                    break;
                }
                blkStart++;
            } else {
                block.append((char) ch);
            }
        }

        return block.length() > 0;
    }

    /**
     * Returns the next character of the input, refilling the read buffer when it is used up.
     * Every returned character advances {@code nextBlkStart} by one.
     *
     * @return the character, or -1 when the reader signals the end of input
     */
    int readCharFromBuffer() throws IOException {
        final boolean exhausted = readBufferIndex >= readBufferLen;
        if (exhausted) {
            final int filled = input.read(readBuffer);
            readBufferLen = filled;
            if (filled == -1) {
                return -1;
            }
            readBufferIndex = 0;
        }
        final int next = readBuffer[readBufferIndex];
        readBufferIndex++;
        nextBlkStart++;
        return next;
    }

    /**
     * Tells whether {@code c} is one of the configured delimiter characters.
     *
     * @param c character to test
     * @return true when {@code c} occurs in the delimiter string
     */
    boolean isDelimiter(final int c) {
        final int position = delimiters.indexOf(c);
        return position != -1;
    }

    /**
     * A token queued for emission: its text, its offsets relative to the current block, its
     * position increment and, for dictionary matches, the raw FST output.
     */
    static class MyToken {
        /** Token text. */
        final String word;

        /**
         * Offsets within the block, the position increment, and the ordering sequence:
         * 0 marks the source word of a dictionary match, values from 1 upwards its synonyms,
         * and {@link Integer#MAX_VALUE} a token unrelated to synonyms.
         */
        final int startOffset, endOffset, posInc, seq;

        /** FST output of a dictionary match; null for every other token. */
        final BytesRef output;

        /**
         * Creates the token for the source word of a dictionary match.
         *
         * @param key characters of the block
         * @param startOffset inclusive start of the match in {@code key}
         * @param endOffset exclusive end of the match in {@code key}
         * @param posInc position increment
         * @param output FST output of the match
         * @param ignoreCase whether the word is stored lower-cased
         */
        public MyToken(final char[] key, final int startOffset, final int endOffset, final int posInc, final BytesRef output,
                final boolean ignoreCase) {
            final String surface = new String(key, startOffset, endOffset - startOffset);
            // Locale.ROOT keeps the result independent of the JVM default locale, in line with
            // the locale-independent Character.toLowerCase used for the dictionary lookup.
            this.word = ignoreCase ? surface.toLowerCase(Locale.ROOT) : surface;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
            this.posInc = posInc;
            this.output = output;
            this.seq = 0; // zero for seq means that this token is the original of synonyms
        }

        /**
         * Creates a token that is unrelated to synonyms.
         */
        public MyToken(final String word, final int startOffset, final int endOffset, final int posInc) {
            this(word, startOffset, endOffset, posInc, Integer.MAX_VALUE); // Integer.MAX_VALUE for seq means unused
        }

        /**
         * Creates a token with an explicit ordering sequence and no FST output.
         */
        public MyToken(final String word, final int startOffset, final int endOffset, final int posInc, final int seq) {
            this.word = word;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
            this.posInc = posInc;
            this.output = null; // means unused
            this.seq = seq;
        }

        /**
         * Tells whether {@code o} merely repeats this token: same text and offsets, and
         * {@code o} has a position increment of 0. The position increment of this token is
         * not considered.
         *
         * @param o candidate token
         * @return true when {@code o} can be dropped as a duplicate of this token
         */
        public boolean identical(final MyToken o) {
            return o.posInc == 0 && startOffset == o.startOffset && endOffset == o.endOffset && word.equals(o.word);
        }

        @Override
        public String toString() {
            final StringBuilder sb = new StringBuilder();
            sb.append(word).append(',').append(startOffset).append(',').append(endOffset).append(',').append(posInc);
            return sb.toString();
        }

        /**
         * Two tokens are equal when text, offsets and position increment agree; {@code seq}
         * and {@code output} are not compared.
         */
        @Override
        public boolean equals(final Object other) {
            if (this == other) {
                return true;
            }
            if (!(other instanceof MyToken)) {
                return false;
            }
            final MyToken o = (MyToken) other;
            return startOffset == o.startOffset && endOffset == o.endOffset && posInc == o.posInc && word.equals(o.word);
        }

        /**
         * Combines exactly the fields compared by {@link #equals(Object)}.
         */
        @Override
        public int hashCode() {
            int result = word.hashCode();
            result = 31 * result + startOffset;
            result = 31 * result + endOffset;
            result = 31 * result + posInc;
            return result;
        }
    }

    /*
      static class SynInfo {
        final String src;
        final int offset, length;
        final String[] synonyms;
        Mode mode;
        int count;
        SynInfo(String src, int offset, int length, String[] synonyms){
          this.src = src;
          this.offset = offset;
          this.length = length;
          this.synonyms = synonyms;
        }
    
        static enum Mode {
          PREV, SYN, AFTER;
        }
      }
      */

    /**
     * Orders tokens by ascending start offset, then ascending end offset, then descending
     * position increment, then ascending sequence number. Tokens that agree on all four
     * compare as equal.
     */
    static class MyTokensComparator implements Comparator<MyToken> {
        @Override
        public int compare(final MyToken t1, final MyToken t2) {
            int result = Integer.compare(t1.startOffset, t2.startOffset);
            if (result != 0) {
                return result;
            }

            result = Integer.compare(t1.endOffset, t2.endOffset);
            if (result != 0) {
                return result;
            }

            // Larger position increments sort first, hence the swapped operands.
            result = Integer.compare(t2.posInc, t1.posInc);
            if (result != 0) {
                return result;
            }

            return Integer.compare(t1.seq, t2.seq);
        }
    }
}