// org.codelibs.elasticsearch.extension.analysis.NGramSynonymTokenizer (Maven / Gradle / Ivy)
package org.codelibs.elasticsearch.extension.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;
// https://issues.apache.org/jira/browse/LUCENE-5252
public final class NGramSynonymTokenizer extends Tokenizer {
/** Default n-gram length. */
public static final int DEFAULT_N_SIZE = 2;
/** Default set of characters that separate blocks of text. */
public static final String DEFAULT_DELIMITERS = " \t\n\r";
/** Size, in chars, of the buffer used to read from the input reader. */
static final int BUFFER_SIZE = 4096;
/** N-gram length used when splitting text that is not a synonym match. */
private final int n;
/** Characters treated as block separators by {@code isDelimiter}. */
private final String delimiters;
/** When true, the matched text and every synonym of it are emitted; otherwise only the first synonym. */
private final boolean expand;
/** When true, plain n-grams of the whole block are merged in on top of the synonym-aware tokens. */
private final boolean expandNgram;
/** When true, dictionary lookups and matched words are lower-cased. */
private final boolean ignoreCase;
/** Loader used by {@code reset()} to pick up updated synonyms; null when reloading is not possible. */
private final SynonymLoader synonymLoader;
/** Timestamp handed to {@code synonymLoader.isUpdate(...)} to detect a newer synonym map. */
private long lastModified;
/** Current synonym map, or null when no synonyms are available. */
private SynonymMap synonymMap = null;
/** Reusable arc for walking {@link #fst}. */
private FST.Arc<BytesRef> scratchArc;
/** FST of the current synonym map. */
private FST<BytesRef> fst;
/** Reader obtained from {@link #fst}. */
private FST.BytesReader fstReader;
/** Scratch buffer receiving the UTF-8 bytes of a synonym word. */
private final BytesRef scratchBytes = new BytesRef();
/** Scratch buffer receiving the UTF-16 form of a synonym word. */
private final CharsRef scratchChars = new CharsRef();
/** End offset (within the block) of the last match recorded by {@code getLongestMatchOutput}. */
private int longestMatchEndOffset;
/** Last char read from the input, or -1 once the input is exhausted. */
private int ch;
/** Read-ahead buffer filled from the input reader. */
private final char[] readBuffer;
/** Index of the next unread char in {@link #readBuffer}. */
private int readBufferIndex;
/** Number of valid chars in {@link #readBuffer}; -1 after end of input. */
private int readBufferLen;
/** Text of the block (run of non-delimiter chars) currently being tokenized. */
StringBuilder block;
/** Offset of the current block within the input; added to block-relative token offsets. */
int blkStart;
/** Number of chars consumed from the input so far; becomes the next block's initial start. */
int nextBlkStart;
/** Corrected end offset of the most recently emitted token; reported by {@code end()}. */
private int finalOffset;
/** Tokens of the current block waiting to be emitted, ordered by {@code MyTokensComparator}. */
private final PriorityQueue<MyToken> queue;
/** Previously emitted token of the current block, used to drop duplicates. */
private MyToken prevToken;
/** Dictionary matches found in the current block, in order of appearance. */
private final List<MyToken> synonyms;
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
private final OffsetAttribute offsetAttr = addAttribute(OffsetAttribute.class);
private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class);
protected NGramSynonymTokenizer(final int n, final String delimiters, final boolean expand, final boolean expandNgram,
final boolean ignoreCase, final SynonymLoader synonymLoader) {
this.n = n;
this.delimiters = delimiters;
this.expand = expand;
this.expandNgram = expandNgram;
this.ignoreCase = ignoreCase;
if (synonymLoader != null) {
if (synonymLoader.isReloadable()) {
this.synonymLoader = synonymLoader;
this.lastModified = synonymLoader.getLastModified();
} else {
this.synonymLoader = null;
this.lastModified = System.currentTimeMillis();
}
synonymMap = synonymLoader.getSynonymMap();
if (synonymMap != null && synonymMap.fst == null) {
this.synonymMap = null;
}
} else {
this.synonymLoader = null;
}
if (synonymMap != null) {
this.fst = synonymMap.fst;
this.fstReader = fst.getBytesReader();
scratchArc = new FST.Arc<>();
}
ch = 0;
readBuffer = new char[BUFFER_SIZE];
readBufferIndex = BUFFER_SIZE;
readBufferLen = 0;
block = new StringBuilder();
nextBlkStart = 0;
queue = new PriorityQueue<>(100,
new MyTokensComparator());
this.synonyms = new ArrayList<>();
}
/**
 * Emits the next token. Pending tokens of the current block are drained first;
 * when none are left the next block is read, matched against the dictionary and tokenized.
 *
 * @return true if a token was emitted, false once no further block can be read
 */
@Override
public boolean incrementToken() throws IOException {
    MyToken token = getNextUniqueToken(queue, prevToken);
    while (token == null) {
        // Queue drained: move on to the next block, or stop at end of input.
        getNextBlock();
        if (block.length() == 0) {
            return false;
        }
        consultDictionary();
        tokenizeWholeBlock();
        token = getNextUniqueToken(queue, prevToken);
    }

    prevToken = token;
    clearAttributes();
    termAttr.append(token.word);
    // Token offsets are block-relative; shift by the block start before correcting.
    finalOffset = correctOffset(blkStart + token.endOffset);
    final int correctedStart = correctOffset(blkStart + token.startOffset);
    offsetAttr.setOffset(correctedStart, finalOffset);
    posIncAttr.setPositionIncrement(token.posInc);
    return true;
}
/**
 * Polls the queue until it yields a token that is not a duplicate of {@code prev}.
 * Duplicates (as decided by {@code prev.identical(token)}) are discarded.
 *
 * @param que  queue of pending tokens; polled tokens are removed from it
 * @param prev previously emitted token, or null if there is none
 * @return the next non-duplicate token, or null when the queue is exhausted
 */
static MyToken getNextUniqueToken(final PriorityQueue<MyToken> que, final MyToken prev) {
    MyToken token;
    while ((token = que.poll()) != null) {
        if (prev == null || !prev.identical(token)) {
            return token;
        }
    }
    return null;
}
/**
 * Scans the current block from left to right and records every longest dictionary
 * match in {@code synonyms}. Does nothing when no synonym map is available.
 */
void consultDictionary() throws IOException {
    if (synonymMap == null) {
        return;
    }
    synonyms.clear();
    final char[] key = block.toString().toCharArray();
    int start = 0;
    while (start < key.length) {
        final BytesRef matchOutput = getLongestMatchOutput(key, start);
        if (matchOutput != null) {
            synonyms.add(new MyToken(key, start, longestMatchEndOffset, 1, matchOutput.clone(), ignoreCase));
            // Resume right after the match so matches never overlap.
            start = longestMatchEndOffset;
        } else {
            start++;
        }
    }
}
/**
 * Walks the FST over {@code src} starting at {@code start} and returns the output of the
 * longest entry that matches there. On a match, {@code longestMatchEndOffset} is set to the
 * index just past the matched text.
 *
 * @param src   characters of the current block
 * @param start index in {@code src} where matching begins
 * @return the accumulated output of the longest match, or null if nothing matches
 */
BytesRef getLongestMatchOutput(final char[] src, final int start) throws IOException {
    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);
    BytesRef matchOutput = null;
    int pos = start;
    while (pos < src.length) {
        final int codePoint = Character.codePointAt(src, pos, src.length);
        final int label = ignoreCase ? Character.toLowerCase(codePoint) : codePoint;
        if (fst.findTargetArc(label, scratchArc, scratchArc, fstReader) == null) {
            // No continuation in the FST: keep whatever match was found so far.
            break;
        }
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output());
        pos += Character.charCount(codePoint);
        if (scratchArc.isFinal()) {
            matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput());
            longestMatchEndOffset = pos;
        }
    }
    return matchOutput;
}
/**
 * Rebuilds {@code queue} with all tokens of the current block.
 * Text between dictionary matches is split into n-grams; each match contributes
 * short grams touching its boundaries plus the synonym words decoded from its output.
 * When {@code expandNgram} is set, plain n-grams of the whole block are merged in at the end.
 */
void tokenizeWholeBlock() {
queue.clear();
int nextStart = 0;
final int end = block.length();
// Whether the previous match produced "after-synonym" grams; passed on to
// tokenizePartialBlock, which then starts the following text with posInc 0.
boolean afterSynonymProduced = false;
final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
for (int idx = 0; idx < synonyms.size(); idx++) {
final MyToken synonym = synonyms.get(idx);
// N-grams for the plain text between the previous match and this one.
tokenizePartialBlock(nextStart, synonym.startOffset,
afterSynonymProduced);
// Short grams ending at this match's start, reaching back no further than
// the end of the previous match (or the block start).
processPrevSynonym(synonym.startOffset, idx > 0 ? synonyms.get(idx - 1).endOffset : 0);
// In expand mode the matched text itself is emitted as well.
if (expand) {
queue.add(synonym);
}
// Decode the match output: a VInt header followed by one word id per synonym.
bytesReader.reset(synonym.output.bytes, synonym.output.offset, synonym.output.length);
final int code = bytesReader.readVInt();
// NOTE(review): the low bit of the header is dropped here; presumably it is the
// SynonymMap "keep original" flag - confirm against the SynonymMap encoding.
final int count = code >>> 1;
for (int i = 0; i < count; i++) {
synonymMap.words.get(bytesReader.readVInt(), scratchBytes);
// Grow the scratch char buffer when it is smaller than the UTF-8 byte length.
if (scratchChars.chars.length < scratchBytes.length) {
scratchChars.chars = new char[scratchBytes.length];
}
scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars);
final String word = scratchChars.toString();
// Default: stacked on the same position (posInc 0) with seq = 1-based synonym index.
int posInc = 0;
int seq = i + 1;
if (synonym.word.equals(word)) {
// Same text as the match: advances the position and sorts first (seq 0).
posInc = 1;
seq = 0;
} else if (!expand) {
// Without expansion the single emitted synonym must advance the position itself.
posInc = 1;
}
queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, posInc, seq));
// Without expansion only the first synonym word is emitted.
if (!expand) {
break;
}
}
// Short grams starting at this match's end, reaching no further than the
// start of the next match (or the block end).
afterSynonymProduced =
processAfterSynonym(synonym.endOffset, idx < synonyms.size() - 1 ? synonyms.get(idx + 1).startOffset : block.length());
nextStart = synonym.endOffset;
}
// N-grams for the text after the last match (or the whole block when there is none).
tokenizePartialBlock(nextStart, end, afterSynonymProduced);
if (expandNgram) {
tokenizeWholeBlockWithNGram();
}
}
/**
 * Re-fills {@code queue} with its current tokens merged with the plain n-grams of the
 * whole block (each added with posInc 0). An n-gram is skipped when the token it is
 * paired with at the same start offset already carries the same word.
 */
void tokenizeWholeBlockWithNGram() {
    final int blockLen = block.length();
    final MyToken[] pending = queue.toArray(new MyToken[queue.size()]);
    queue.clear();

    int gramStart = 0;
    int tokenIdx = 0;
    while (gramStart + n <= blockLen && tokenIdx < pending.length) {
        final MyToken current = pending[tokenIdx];
        final boolean emitToken = current.startOffset <= gramStart;
        final boolean emitGram = current.startOffset >= gramStart;
        if (emitToken) {
            queue.add(current);
            tokenIdx++;
        }
        if (emitGram) {
            final String gram = block.substring(gramStart, gramStart + n);
            // When token and gram start together, the gram is only added if it differs.
            if (!emitToken || !current.word.equals(gram)) {
                queue.add(new MyToken(gram, gramStart, gramStart + n, 0));
            }
            gramStart++;
        }
    }

    // Whatever tokens remain once the n-grams ran out.
    for (; tokenIdx < pending.length; tokenIdx++) {
        queue.add(pending[tokenIdx]);
    }
    // Whatever n-grams remain once the tokens ran out.
    for (; gramStart + n <= blockLen; gramStart++) {
        queue.add(new MyToken(block.substring(gramStart, gramStart + n), gramStart, gramStart + n, 0));
    }
}
/**
 * Enqueues the n-grams of the block text in {@code [startOffset, endOffset)}.
 * A range shorter than {@code n} is enqueued as a single token.
 *
 * @param startOffset          inclusive start within the block
 * @param endOffset            exclusive end within the block
 * @param afterSynonymProduced when true the first token gets posInc 0 instead of 1
 */
void tokenizePartialBlock(final int startOffset, final int endOffset,
        final boolean afterSynonymProduced) {
    if (endOffset <= startOffset) {
        return;
    }
    final int firstPosInc = afterSynonymProduced ? 0 : 1;
    if (endOffset - startOffset < n) {
        final String shortText = block.substring(startOffset, endOffset);
        queue.add(new MyToken(shortText, startOffset, endOffset, firstPosInc));
        return;
    }
    for (int from = startOffset; from + n <= endOffset; from++) {
        // Only the very first gram may share a position with what came before.
        final int posInc = (from == startOffset) ? firstPosInc : 1;
        queue.add(new MyToken(block.substring(from, from + n), from, from + n, posInc));
    }
}
/**
 * Enqueues the grams of length 1 to {@code n - 1} that end at {@code endOffset},
 * shortest first, as long as they do not start before {@code limitOffset}.
 * All of them are added with posInc 0.
 *
 * @param endOffset   exclusive end of every gram (the start of the synonym match)
 * @param limitOffset smallest start offset a gram may have
 */
void processPrevSynonym(final int endOffset, final int limitOffset) {
    for (int len = 1; len < n; len++) {
        final int from = endOffset - len;
        if (from < limitOffset) {
            break;
        }
        queue.add(new MyToken(block.substring(from, endOffset), from, endOffset, 0));
    }
}
/**
 * Enqueues the grams of length 1 to {@code n - 1} that start at {@code startOffset},
 * shortest first, as long as they do not end past {@code limitOffset}.
 * The first gram gets posInc 1, the rest posInc 0.
 *
 * @param startOffset start of every gram (the end of the synonym match)
 * @param limitOffset largest end offset a gram may have
 * @return true if at least one gram was enqueued
 */
boolean processAfterSynonym(final int startOffset, final int limitOffset) {
    boolean produced = false;
    for (int len = 1; len < n; len++) {
        final int to = startOffset + len;
        if (to > limitOffset) {
            break;
        }
        final int posInc = produced ? 0 : 1;
        queue.add(new MyToken(block.substring(startOffset, to), startOffset, to, posInc));
        produced = true;
    }
    return produced;
}
/**
 * Finishes the stream and reports the end offset of the last emitted token
 * as both start and end of the final offset.
 */
@Override
public void end() throws IOException {
    super.end();
    final int last = finalOffset;
    offsetAttr.setOffset(last, last);
}
/**
 * Prepares the tokenizer for a new input: clears all per-stream state and,
 * when the synonym loader reports an update, switches to the new synonym map.
 *
 * @throws IllegalArgumentException if a reloaded synonym map has no FST; the
 *         previously active map is left in place in that case
 */
@Override
public void reset() throws IOException {
    super.reset();
    block.setLength(0);
    // Tokens left over from a stream that was not fully consumed must not leak
    // into the next one, and end() must not report the previous stream's offset.
    queue.clear();
    prevToken = null;
    finalOffset = 0;
    readBufferIndex = BUFFER_SIZE;
    readBufferLen = 0;
    ch = 0;
    blkStart = 0;
    nextBlkStart = 0;
    if (synonymLoader != null && synonymLoader.isUpdate(lastModified)) {
        lastModified = synonymLoader.getLastModified();
        final SynonymMap map = synonymLoader.getSynonymMap();
        if (map != null) {
            // Validate before touching any field so a bad map cannot leave
            // synonymMap set while fst is null.
            if (map.fst == null) {
                throw new IllegalArgumentException("fst must be non-null");
            }
            synonymMap = map;
            fst = map.fst;
            fstReader = fst.getBytesReader();
            scratchArc = new FST.Arc<>();
            clearAttributes();
        }
    }
}
/**
 * Reads the next run of non-delimiter characters into {@code block}.
 * Leading delimiters are skipped and advance {@code blkStart}; the block ends at
 * the first delimiter after some text, or at end of input.
 *
 * @return true if a non-empty block was read
 */
boolean getNextBlock() throws IOException {
    blkStart = nextBlkStart;
    block.setLength(0);
    prevToken = null;
    // Once end of input was seen (ch == -1) no further reads are attempted.
    while (ch != -1) {
        ch = readCharFromBuffer();
        if (ch == -1) {
            break;
        }
        if (isDelimiter(ch)) {
            if (block.length() > 0) {
                break;
            }
            blkStart++;
        } else {
            block.append((char) ch);
        }
    }
    return block.length() > 0;
}
/**
 * Returns the next char of the input, refilling the read buffer when it is used up.
 * Every char returned also advances {@code nextBlkStart}.
 *
 * @return the char as an int, or -1 at end of input
 */
int readCharFromBuffer() throws IOException {
    final boolean bufferUsedUp = readBufferIndex >= readBufferLen;
    if (bufferUsedUp) {
        final int filled = input.read(readBuffer);
        readBufferLen = filled;
        if (filled == -1) {
            return -1;
        }
        readBufferIndex = 0;
    }
    final char next = readBuffer[readBufferIndex];
    readBufferIndex++;
    nextBlkStart++;
    return next;
}
/**
 * Tells whether {@code c} is one of the configured delimiter characters.
 *
 * @param c character (code) to test
 * @return true if {@code c} occurs in {@code delimiters}
 */
boolean isDelimiter(final int c) {
    final int position = delimiters.indexOf(c);
    return position != -1;
}
/**
 * A token of the current block: its text, block-relative offsets, position increment,
 * and (for dictionary matches) the raw synonym output.
 */
static class MyToken {
    /** Token text. */
    final String word;
    /**
     * Block-relative start/end offsets, position increment, and ordering sequence
     * (0 = original of a synonym group, {@code Integer.MAX_VALUE} = unused).
     */
    final int startOffset, endOffset, posInc, seq;
    /** Synonym output of a dictionary match, or null when unused. */
    final BytesRef output;

    /**
     * Creates the token for a dictionary match over {@code key[startOffset, endOffset)}.
     *
     * @param key         characters of the block
     * @param startOffset inclusive start of the match
     * @param endOffset   exclusive end of the match
     * @param posInc      position increment
     * @param output      synonym output of the match
     * @param ignoreCase  whether the word is stored lower-cased
     */
    public MyToken(final char[] key, final int startOffset, final int endOffset, final int posInc,
            final BytesRef output, final boolean ignoreCase) {
        final String text = new String(key, startOffset, endOffset - startOffset);
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. Turkish dotless i), matching the locale-independent FST lookup.
        this.word = ignoreCase ? text.toLowerCase(java.util.Locale.ROOT) : text;
        this.startOffset = startOffset;
        this.endOffset = endOffset;
        this.posInc = posInc;
        this.output = output;
        this.seq = 0; // zero for seq means that this token is the original of synonyms
    }

    /** Creates a plain token whose {@code seq} is unused. */
    public MyToken(final String word, final int startOffset, final int endOffset, final int posInc) {
        this(word, startOffset, endOffset, posInc, Integer.MAX_VALUE); // Integer.MAX_VALUE for seq means unused
    }

    /** Creates a token with an explicit {@code seq} and no synonym output. */
    public MyToken(final String word, final int startOffset, final int endOffset, final int posInc,
            final int seq) {
        this.word = word;
        this.startOffset = startOffset;
        this.endOffset = endOffset;
        this.posInc = posInc;
        this.output = null; // means unused
        this.seq = seq;
    }

    /**
     * Tells whether {@code o} is a duplicate of this token: same word and offsets,
     * and {@code o} does not advance the position (posInc 0).
     */
    public boolean identical(final MyToken o) {
        return o.posInc == 0
                && word.equals(o.word)
                && startOffset == o.startOffset
                && endOffset == o.endOffset;
    }

    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();
        sb.append(word).append(',').append(startOffset).append(',')
                .append(endOffset).append(',').append(posInc);
        return sb.toString();
    }

    /** Two tokens are equal when word, offsets and position increment all match. */
    @Override
    public boolean equals(final Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof MyToken)) {
            return false;
        }
        final MyToken o = (MyToken) other;
        return word.equals(o.word)
                && startOffset == o.startOffset
                && endOffset == o.endOffset
                && posInc == o.posInc;
    }

    /**
     * Hash over exactly the fields compared by {@link #equals(Object)}.
     * The previous expression mixed {@code +} and {@code <<} without parentheses, so the
     * offsets ended up as shift distances instead of contributing to the hash.
     */
    @Override
    public int hashCode() {
        int result = word.hashCode();
        result = 31 * result + startOffset;
        result = 31 * result + endOffset;
        result = 31 * result + posInc;
        return result;
    }
}
/*
static class SynInfo {
final String src;
final int offset, length;
final String[] synonyms;
Mode mode;
int count;
SynInfo(String src, int offset, int length, String[] synonyms){
this.src = src;
this.offset = offset;
this.length = length;
this.synonyms = synonyms;
}
static enum Mode {
PREV, SYN, AFTER;
}
}
*/
/**
 * Orders tokens for emission: by ascending start offset, then descending position
 * increment, then ascending end offset, then ascending {@code seq}.
 * Tokens that agree on all four keys compare as equal (0), as the
 * {@link Comparator} contract requires.
 */
static class MyTokensComparator implements Comparator<MyToken> {
    @Override
    public int compare(final MyToken t1, final MyToken t2) {
        int result = Integer.compare(t1.startOffset, t2.startOffset);
        if (result != 0) {
            return result;
        }
        // Larger position increment first, so the position-advancing token leads its group.
        result = Integer.compare(t2.posInc, t1.posInc);
        if (result != 0) {
            return result;
        }
        result = Integer.compare(t1.endOffset, t2.endOffset);
        if (result != 0) {
            return result;
        }
        return Integer.compare(t1.seq, t2.seq);
    }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy