// com.bigdata.search.TermCompletionAnalyzer
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on May 8, 2014 by Jeremy J. Carroll, Syapse Inc.
*/
package com.bigdata.search;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.CharBuffer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
* An analyzer intended for the term-completion use case; particularly
* for technical vocabularies and concept schemes.
*
*
* This analyzer generates several index terms for each word in the input.
* These are intended to match short sequences (e.g. three or more) characters
* of user-input, to then give the user a drop-down list of matching terms.
*
* This can be set up to address issues like matching "half-time"
* when the user types "tim", or when the user types "halft"
* (treating the hyphen as a soft hyphen); or
* to match "TermCompletionAnalyzer" when the user types "Ana".
*
* In contrast, the Lucene Analyzers are mainly geared around the free text search use
* case.
*
* The intended use cases will typically involve a prefix query of the form:
*
* ?t bds:search "prefix*" .
*
* to find all literals in the selected graphs, which are indexed by a term starting in prefix,
* so the problem this class addresses is finding the appropriate index terms to allow
* matching, at sensible points, mid-way through words (such as at hyphens).
*
* To get maximum effectiveness it may be best to use private-use language subtags (see RFC 5646),
* e.g. "x-term"
* which are mapped to this class by {@link ConfigurableAnalyzerFactory} for
* the data being loaded into the store, and linked to some very simple process
* like {@link KeywordAnalyzer} for queries which are tagged with a different language tag
* that is only used for bds:search, e.g. "x-query".
* The above prefix query then becomes:
*
* ?t bds:search "prefix*"@x-query .
*
*
*
*
* @author jeremycarroll
*
*/
public class TermCompletionAnalyzer extends Analyzer {

    /** Definition of whitespace: the input is split into words at matches of this pattern. */
    private final Pattern wordBoundary;

    /** Additional index terms start after each match of this pattern within a word. */
    private final Pattern subWordBoundary;

    /**
     * Characters to strip from generated terms ("soft hyphens"). When no
     * soft-hyphen pattern is supplied this is a pattern that can never match,
     * turning the discard machinery into a no-op.
     */
    private final Pattern discard;

    /** If true, only the stripped variant of each term is emitted, never the raw one. */
    private final boolean alwaysDiscard;

    /**
     * Divide the input into words and short tokens
     * as with {@link #TermCompletionAnalyzer(Pattern, Pattern)}.
     * Each term is generated, and then an additional term
     * is generated with soft hyphens (defined by the pattern)
     * removed. If the alwaysRemoveSoftHypens flag is true,
     * then the first term (before the removal) is suppressed.
     *
     * @param wordBoundary The definition of space (e.g. " ")
     * @param subWordBoundary Also index after matches to this (e.g. "-")
     * @param softHyphens Discard these characters from matches (may be null for none)
     * @param alwaysRemoveSoftHypens If false the discard step is optional.
     */
    public TermCompletionAnalyzer(Pattern wordBoundary,
            Pattern subWordBoundary,
            Pattern softHyphens,
            boolean alwaysRemoveSoftHypens) {
        this.wordBoundary = wordBoundary;
        this.subWordBoundary = subWordBoundary;
        if (softHyphens != null) {
            discard = softHyphens;
            alwaysDiscard = alwaysRemoveSoftHypens;
        } else {
            // "(?!)" is a negative lookahead of the empty string: it never matches.
            discard = Pattern.compile("(?!)");
            alwaysDiscard = true;
        }
    }

    /**
     * Divide the input into words, separated by the wordBoundary,
     * and return a token for each whole word, and then
     * generate further tokens for each word by removing prefixes
     * up to and including each successive match of subWordBoundary.
     *
     * @param wordBoundary The definition of space (e.g. " ")
     * @param subWordBoundary Also index after matches to this (e.g. "-")
     */
    public TermCompletionAnalyzer(Pattern wordBoundary,
            Pattern subWordBoundary) {
        this(wordBoundary, subWordBoundary, null, true);
    }

    /**
     * This class has three processes going on,
     * all driven from the {@link #incrementToken()} method.
     *
     * One process is that of iterating over the words in the input:
     * - the words are identified in the constructor, and the iteration
     *   is performed by {@link #nextWord()}
     *
     * - the subword boundaries are identified in {@link #next()}.
     *   We then set up {@link #found} to contain the most
     *   recently found subword.
     *
     * - the soft hyphen discarding is processed in {@link #maybeDiscardHyphens()}
     *
     * - if we are not {@link #alwaysDiscard}ing then {@link #afterDiscard}
     *   can be set to null to return the non-discarded version on the next cycle.
     */
    private class TermCompletionTokenStream extends TokenStream {

        /** The input split into words by {@link #wordBoundary}. */
        final String[] words;

        final CharTermAttribute termAtt;

        /** The word currently being decomposed into suffix terms. */
        char currentWord[] = new char[]{};

        /** Matcher for {@link #subWordBoundary} over the current word. */
        Matcher softMatcher;

        /** Index into {@link #words}; -1 until the first word is consumed. */
        int currentWordIx = -1;

        /** Offset into {@link #currentWord} at which the current term starts. */
        int charPos = 0;

        /**
         * The current term with soft hyphens stripped out, or null when the
         * current term contains none (or the stripped variant has just been
         * emitted and the raw variant is due next).
         */
        private String afterDiscard;

        /** The most recently found (unstripped) term: a view onto {@link #currentWord}. */
        private CharBuffer found;

        public TermCompletionTokenStream(final Reader reader) {
            termAtt = addAttribute(CharTermAttribute.class);
            words = wordBoundary.split(getStringReaderContents(reader));
        }

        @Override
        public boolean incrementToken() throws IOException {
            // The TokenStream contract requires attributes to be cleared
            // before a new token is produced. (This keeps the term buffer's
            // capacity, so the resizeBuffer in nextWord() still holds.)
            clearAttributes();
            if ( next() ) {
                if (afterDiscard != null) {
                    // Emit the soft-hyphen-stripped variant.
                    int lg = afterDiscard.length();
                    afterDiscard.getChars(0, lg, termAtt.buffer(), 0);
                    termAtt.setLength(lg);
                } else {
                    // Emit the raw variant straight out of currentWord.
                    int lg = found.length();
                    found.get(termAtt.buffer(), 0, lg);
                    termAtt.setLength(lg);
                }
                return true;
            } else {
                return false;
            }
        }

        /**
         * Advance to the next term, setting up {@link #found} and
         * {@link #afterDiscard}.
         *
         * @return false when the input is exhausted.
         */
        private boolean next() {
            if (currentWordIx >= words.length) {
                return false;
            }
            if (!alwaysDiscard) {
                // Last match was the discarded version,
                // now do the non-discard version.
                if (afterDiscard != null) {
                    afterDiscard = null;
                    return true;
                }
            }
            afterDiscard = null;
            // Look for the next sub-word boundary strictly after the current
            // term start; when none remains, move on to the next whole word.
            if (charPos + 1 < currentWord.length && softMatcher.find(charPos+1)) {
                charPos = softMatcher.end();
                maybeDiscardHyphens();
                return true;
            } else {
                return nextWord();
            }
        }

        /**
         * Set {@link #found} to the suffix of the current word starting at
         * {@link #charPos}; if it contains soft hyphens, also prepare
         * {@link #afterDiscard} with them removed.
         */
        void maybeDiscardHyphens() {
            found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
            Matcher discarding = discard.matcher(found);
            if (discarding.find()) {
                afterDiscard = discarding.replaceAll("");
            }
        }

        /**
         * Advance to the next whole word, resetting the sub-word matcher.
         *
         * @return false when there are no more words.
         */
        private boolean nextWord() {
            currentWordIx++;
            if (currentWordIx >= words.length) {
                return false;
            }
            currentWord = words[currentWordIx].toCharArray();
            // Every term is a suffix of currentWord, so this bounds all
            // subsequent copies into the term attribute's buffer.
            termAtt.resizeBuffer(currentWord.length);
            charPos = 0;
            softMatcher = subWordBoundary.matcher(words[currentWordIx]);
            maybeDiscardHyphens();
            return true;
        }
    }

    /**
     * Read the entire contents of a {@link Reader} into a String, leaving the
     * reader repositioned at its start (via mark/reset) so that it can be
     * read again. The reader must support {@link Reader#mark(int)}, as e.g.
     * {@link StringReader} does.
     *
     * @param reader the reader to drain
     * @return the full character contents of the reader
     */
    static String getStringReaderContents(Reader reader) {
        try {
            reader.mark(Integer.MAX_VALUE);
            // Read in a loop: a single read(char[]) call is not guaranteed to
            // fill the array for an arbitrary Reader, which previously could
            // yield a truncated (zero-padded) result.
            final StringBuilder contents = new StringBuilder();
            final char[] chunk = new char[4096];
            int n;
            while ((n = reader.read(chunk)) != -1) {
                contents.append(chunk, 0, n);
            }
            reader.reset();
            return contents.toString();
        } catch (IOException e) {
            // In-memory readers (the only ones used here) do not throw.
            throw new RuntimeException("Impossible",e);
        }
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // The StandardTokenizer is only a placeholder source: the returned
        // token stream re-reads the captured Reader itself.
        Tokenizer source = new StandardTokenizer();
        return new TokenStreamComponents(source){
            private Reader reader;
            @Override
            protected void setReader(Reader reader) {
                this.reader = reader;
                super.setReader(reader);
            }
            @Override
            public TokenStream getTokenStream() {
                return new TermCompletionTokenStream(reader);
            }
        };
    }
}