/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.lucene.analysis.miscellaneous;

import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.hash.MurmurHash3;

import java.io.IOException;
import java.util.ArrayList;

/**
 * Inspects token streams for duplicate sequences of tokens. Token sequences
 * have a minimum length - 6 is a good heuristic as it avoids filtering common
 * idioms/phrases but detects longer sections that are typical of cut+paste
 * copies of text.
 * <p>
 * Internally each token is hashed/moduloed into a single byte (so 256 possible
 * values for each token) and then recorded in a trie of seen byte sequences
 * using a {@link DuplicateByteSequenceSpotter}. This trie is passed into the
 * TokenFilter constructor so a single object can be reused across multiple
 * documents.
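 * <p>
 * A minimal usage sketch (illustrative only, not part of this file; it assumes
 * a no-arg {@link DuplicateByteSequenceSpotter} constructor, a Lucene
 * {@code Analyzer} named {@code analyzer} and a field name {@code "body"}).
 * A single spotter is shared across documents so that repeats spanning
 * documents are also detected:
 * <pre>{@code
 * DuplicateByteSequenceSpotter spotter = new DuplicateByteSequenceSpotter();
 * for (String doc : docs) {
 *     try (TokenStream ts = new DeDuplicatingTokenFilter(
 *             analyzer.tokenStream("body", doc), spotter, true)) {
 *         CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
 *         DuplicateSequenceAttribute seq = ts.addAttribute(DuplicateSequenceAttribute.class);
 *         ts.reset();
 *         while (ts.incrementToken()) {
 *             // 0 = first sighting; >= 1 = token lies inside a repeated sequence
 *             System.out.println(term + " " + seq.getNumPriorUsesInASequence());
 *         }
 *         ts.end();
 *     }
 * }
 * }</pre>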
 * <p>
 * The emitDuplicates setting controls if duplicate tokens are filtered from
 * results or are output (the {@link DuplicateSequenceAttribute} attribute can
 * be used to inspect the number of prior sightings when emitDuplicates is true).
 */
public class DeDuplicatingTokenFilter extends FilteringTokenFilter {
    private final DuplicateSequenceAttribute seqAtt = addAttribute(DuplicateSequenceAttribute.class);
    private final boolean emitDuplicates;
    static final MurmurHash3.Hash128 seed = new MurmurHash3.Hash128();

    public DeDuplicatingTokenFilter(TokenStream in, DuplicateByteSequenceSpotter byteStreamDuplicateSpotter) {
        this(in, byteStreamDuplicateSpotter, false);
    }

    /**
     *
     * @param in
     *            The input token stream
     * @param byteStreamDuplicateSpotter
     *            object which retains trie of token sequences
     * @param emitDuplicates
     *            true if duplicate tokens are to be emitted (use
     *            {@link DuplicateSequenceAttribute} attribute to inspect number
     *            of prior sightings of tokens as part of a sequence).
     */
    public DeDuplicatingTokenFilter(TokenStream in, DuplicateByteSequenceSpotter byteStreamDuplicateSpotter,
            boolean emitDuplicates) {
        super(new DuplicateTaggingFilter(byteStreamDuplicateSpotter, in));
        this.emitDuplicates = emitDuplicates;
    }

    @Override
    protected boolean accept() throws IOException {
        return emitDuplicates || seqAtt.getNumPriorUsesInASequence() < 1;
    }

    private static class DuplicateTaggingFilter extends TokenFilter {
        private final DuplicateSequenceAttribute seqAtt = addAttribute(DuplicateSequenceAttribute.class);

        TermToBytesRefAttribute termBytesAtt = addAttribute(TermToBytesRefAttribute.class);
        private DuplicateByteSequenceSpotter byteStreamDuplicateSpotter;
        private ArrayList<State> allTokens;
        int pos = 0;
        private final int windowSize;

        protected DuplicateTaggingFilter(DuplicateByteSequenceSpotter byteStreamDuplicateSpotter, TokenStream input) {
            super(input);
            this.byteStreamDuplicateSpotter = byteStreamDuplicateSpotter;
            this.windowSize = DuplicateByteSequenceSpotter.TREE_DEPTH;
        }

        @Override
        public final boolean incrementToken() throws IOException {
            if (allTokens == null) {
                loadAllTokens();
            }
            clearAttributes();
            if (pos < allTokens.size()) {
                State earlierToken = allTokens.get(pos);
                pos++;
                restoreState(earlierToken);
                return true;
            } else {
                return false;
            }
        }

        public void loadAllTokens() throws IOException {
            // TODO consider changing this implementation to emit tokens as-we-go
            // rather than buffering all. However this array is perhaps not the
            // bulk of memory usage (in practice the dupSequenceSpotter requires
            // ~5x the original content size in its internal tree).
            allTokens = new ArrayList<State>(256);
            /*
             * Given the bytes 123456123456 and a duplicate sequence size of 6,
             * the byteStreamDuplicateSpotter will only flag the final byte as
             * part of a duplicate sequence due to the byte-at-a-time streaming
             * nature of its assessments. When this happens we retain a buffer
             * of the last 6 tokens so that we can mark the states of prior
             * tokens (bytes 7 to 11) as also being duplicates.
             */
            pos = 0;
            boolean isWrapped = false;
            State[] priorStatesBuffer = new State[windowSize];
            short[] priorMaxNumSightings = new short[windowSize];
            int cursor = 0;
            while (input.incrementToken()) {
                BytesRef bytesRef = termBytesAtt.getBytesRef();
                long tokenHash = MurmurHash3.hash128(bytesRef.bytes, bytesRef.offset, bytesRef.length, 0, seed).h1;
                byte tokenByte = (byte) (tokenHash & 0xFF);
                short numSightings = byteStreamDuplicateSpotter.addByte(tokenByte);
                priorStatesBuffer[cursor] = captureState();
                // Revise prior captured State objects if the latest
                // token is marked as a duplicate
                if (numSightings >= 1) {
                    int numLengthsToRecord = windowSize;
                    int pos = cursor;
                    while (numLengthsToRecord > 0) {
                        if (pos < 0) {
                            pos = windowSize - 1;
                        }
                        priorMaxNumSightings[pos] = (short) Math.max(priorMaxNumSightings[pos], numSightings);
                        numLengthsToRecord--;
                        pos--;
                    }
                }
                // Reposition cursor to next free slot
                cursor++;
                if (cursor >= windowSize) {
                    // wrap around the buffer
                    cursor = 0;
                    isWrapped = true;
                }
                // clean out the end of the tail that we may overwrite if the
                // next iteration adds a new head
                if (isWrapped) {
                    // cursor is now positioned on the tail - emit any valid
                    // tokens we may be about to overwrite in the next iteration
                    if (priorStatesBuffer[cursor] != null) {
                        recordLengthInfoState(priorMaxNumSightings, priorStatesBuffer, cursor);
                    }
                }
            } // end loop reading all tokens from stream

            // Flush the buffered tokens
            int pos = isWrapped ? nextAfter(cursor) : 0;
            while (pos != cursor) {
                recordLengthInfoState(priorMaxNumSightings, priorStatesBuffer, pos);
                pos = nextAfter(pos);
            }
        }

        private int nextAfter(int pos) {
            pos++;
            if (pos >= windowSize) {
                pos = 0;
            }
            return pos;
        }

        private void recordLengthInfoState(short[] maxNumSightings, State[] tokenStates, int cursor) {
            if (maxNumSightings[cursor] > 0) {
                // We need to patch in the max sequence length we recorded at
                // this position into the token state
                restoreState(tokenStates[cursor]);
                seqAtt.setNumPriorUsesInASequence(maxNumSightings[cursor]);
                maxNumSightings[cursor] = 0;
                // record the patched state
                tokenStates[cursor] = captureState();
            }
            allTokens.add(tokenStates[cursor]);
        }
    }
}
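/*
 * Worked example of the tagging pass in loadAllTokens() above (illustrative
 * only, assuming DuplicateByteSequenceSpotter.TREE_DEPTH == 6, per the minimum
 * sequence length given in the class Javadoc). For the token-byte stream
 * 1 2 3 4 5 6 1 2 3 4 5 6:
 *
 * - addByte(..) returns 0 for bytes 1..11: no 6-byte sequence has repeated yet.
 * - addByte(..) returns 1 at byte 12, because the sequence 123456 has now been
 *   seen once before.
 * - The windowSize ring buffer then back-fills priorMaxNumSightings for the 5
 *   preceding positions, so the tokens at bytes 7..12 are all emitted with
 *   numPriorUsesInASequence == 1 and are dropped unless emitDuplicates is true.
 */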