/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on May 8, 2014 by Jeremy J. Carroll, Syapse Inc.
 */
package com.bigdata.search;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.CharBuffer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;


/**
 * An analyzer intended for the term-completion use case; particularly
 * for technical vocabularies and concept schemes.
 * <p>
 * This analyzer generates several index terms for each word in the input.
 * These are intended to match short sequences (e.g. three or more
 * characters) of user input, to then give the user a drop-down list of
 * matching terms.
 * <p>
 * This can be set up to address issues like matching "half-time" when the
 * user types "tim", or when the user types "halft" (treating the hyphen
 * as a soft hyphen); or to match "TermCompletionAnalyzer" when the user
 * types "Ana".
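 * <p>
 * As an illustrative sketch (the exact tokens depend on the patterns
 * passed to the constructors below; these pattern choices are assumptions
 * made for the example): with whitespace as the word boundary, "-" as the
 * sub-word boundary and "-" as a soft hyphen, the word "half-time" would
 * be indexed under terms along the lines of:
 * <pre>
 *    half-time
 *    halftime
 *    time
 * </pre>
 * so the user inputs "tim", "halft" and "half-t" can all match it.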

 * <p>
 * In contrast, the Lucene Analyzers are mainly geared around the free
 * text search use case.
 * <p>
 * The intended use cases will typically involve a prefix query of the
 * form:
 * <pre>
 *    ?t bds:search "prefix*" .
 * </pre>
 * to find all literals in the selected graphs which are indexed by a term
 * starting in "prefix". The problem this class addresses is thus finding
 * the appropriate index terms to allow matching, at sensible points,
 * mid-way through words (such as at hyphens).
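 * <p>
 * Continuing the illustrative "half-time" example above, the query
 * <pre>
 *    ?t bds:search "tim*" .
 * </pre>
 * finds the literal because "time" was generated as an index term for
 * "half-time", even though no whole word in the literal starts with
 * "tim".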

 * <p>
 * To get maximum effectiveness it may be best to use private language
 * subtags (see RFC 5646), e.g. "x-term", which are mapped to this class
 * by {@link ConfigurableAnalyzerFactory} for the data being loaded into
 * the store, and linked to some very simple process like
 * {@link KeywordAnalyzer} for queries which are tagged with a different
 * language tag that is only used for bds:search, e.g. "x-query". The
 * above prefix query then becomes:
 * <pre>
 *    ?t bds:search "prefix*"@x-query .
 * </pre>
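 * <p>
 * A hypothetical sketch of such a mapping, as properties for
 * {@link ConfigurableAnalyzerFactory} (the property names here are an
 * assumption for illustration; check them against that class's
 * documentation):
 * <pre>
 *    com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.x-term.analyzerClass=com.bigdata.search.TermCompletionAnalyzer
 *    com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.x-query.analyzerClass=org.apache.lucene.analysis.core.KeywordAnalyzer
 * </pre>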
 *
 * @author jeremycarroll
 */
public class TermCompletionAnalyzer extends Analyzer {

    private final Pattern wordBoundary;
    private final Pattern subWordBoundary;
    private final Pattern discard;
    private final boolean alwaysDiscard;

    /**
     * Divide the input into words and short tokens
     * as with {@link #TermCompletionAnalyzer(Pattern, Pattern)}.
     * Each term is generated, and then an additional term
     * is generated with soft hyphens (defined by the pattern)
     * removed. If the alwaysRemoveSoftHypens flag is true,
     * then the first term (before the removal) is suppressed.
     *
     * @param wordBoundary The definition of space (e.g. " ")
     * @param subWordBoundary Also index after matches to this (e.g. "-")
     * @param softHyphens Discard these characters from matches
     * @param alwaysRemoveSoftHypens If false the discard step is optional.
     */
    public TermCompletionAnalyzer(Pattern wordBoundary,
            Pattern subWordBoundary,
            Pattern softHyphens,
            boolean alwaysRemoveSoftHypens) {
        this.wordBoundary = wordBoundary;
        this.subWordBoundary = subWordBoundary;
        if (softHyphens != null) {
            discard = softHyphens;
            alwaysDiscard = alwaysRemoveSoftHypens;
        } else {
            discard = Pattern.compile("(?!)"); // never matches
            alwaysDiscard = true;
        }
    }

    /**
     * Divide the input into words, separated by the wordBoundary,
     * and return a token for each whole word, and then
     * generate further tokens for each word by removing prefixes
     * up to and including each successive match of subWordBoundary.
     *
     * @param wordBoundary
     * @param subWordBoundary
     */
    public TermCompletionAnalyzer(Pattern wordBoundary, Pattern subWordBoundary) {
        this(wordBoundary, subWordBoundary, null, true);
    }

    /**
     * This class has three processes going on,
     * all driven from the {@link #incrementToken()} method.
     *
     * One process is iterating over the words in the input:
     * the words are identified in the constructor, and the iteration
     * is performed by {@link #nextWord()}.
     *
     * Another identifies the subword boundaries, in {@link #next()};
     * we then set up {@link #found} to contain the most
     * recently found subword.
     *
     * The third discards soft hyphens, in {@link #maybeDiscardHyphens()};
     * if we are not {@link #alwaysDiscard}ing then {@link #afterDiscard}
     * can be set to null to return the non-discarded version on the
     * next cycle.
     */
    private class TermCompletionTokenStream extends TokenStream {

        final String[] words;
        final CharTermAttribute termAtt;
        char currentWord[] = new char[]{};
        Matcher softMatcher;
        int currentWordIx = -1;
        int charPos = 0;
        private String afterDiscard;
        private CharBuffer found;

        public TermCompletionTokenStream(final Reader reader) {
            termAtt = addAttribute(CharTermAttribute.class);
            words = wordBoundary.split(getStringReaderContents(reader));
        }

        @Override
        public boolean incrementToken() throws IOException {
            if (next()) {
                if (afterDiscard != null) {
                    // Emit the version with the soft hyphens stripped.
                    int lg = afterDiscard.length();
                    afterDiscard.getChars(0, lg, termAtt.buffer(), 0);
                    termAtt.setLength(lg);
                } else {
                    // Emit the current subword as found in the input.
                    int lg = found.length();
                    found.get(termAtt.buffer(), 0, lg);
                    termAtt.setLength(lg);
                }
                return true;
            } else {
                return false;
            }
        }

        private boolean next() {
            if (currentWordIx >= words.length) {
                return false;
            }
            if (!alwaysDiscard) {
                // Last match was the discarded version,
                // now do the non-discard version.
                if (afterDiscard != null) {
                    afterDiscard = null;
                    return true;
                }
            }
            afterDiscard = null;
            if (charPos + 1 < currentWord.length && softMatcher.find(charPos + 1)) {
                // Advance past the next subword boundary within the word.
                charPos = softMatcher.end();
                maybeDiscardHyphens();
                return true;
            } else {
                return nextWord();
            }
        }

        void maybeDiscardHyphens() {
            found = CharBuffer.wrap(currentWord, charPos, currentWord.length - charPos);
            Matcher discarding = discard.matcher(found);
            if (discarding.find()) {
                afterDiscard = discarding.replaceAll("");
            }
        }

        private boolean nextWord() {
            currentWordIx++;
            if (currentWordIx >= words.length) {
                return false;
            }
            currentWord = words[currentWordIx].toCharArray();
            termAtt.resizeBuffer(currentWord.length);
            charPos = 0;
            softMatcher = subWordBoundary.matcher(words[currentWordIx]);
            maybeDiscardHyphens();
            return true;
        }
    }

    /**
     * Read the full contents of a mark-supporting reader: mark, skip to
     * the end to measure the length, reset, then read that many chars.
     */
    static String getStringReaderContents(Reader reader) {
        try {
            reader.mark(Integer.MAX_VALUE);
            int length = (int) reader.skip(Integer.MAX_VALUE);
            reader.reset();
            char fileContent[] = new char[length];
            reader.read(fileContent);
            reader.reset();
            return new String(fileContent);
        } catch (IOException e) {
            throw new RuntimeException("Impossible", e);
        }
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // The StandardTokenizer is only a placeholder source: getTokenStream()
        // is overridden to ignore it and build a TermCompletionTokenStream
        // directly from the captured reader.
        Tokenizer source = new StandardTokenizer();
        return new TokenStreamComponents(source) {
            private Reader reader;

            @Override
            protected void setReader(Reader reader) {
                this.reader = reader;
                super.setReader(reader);
            }

            @Override
            public TokenStream getTokenStream() {
                return new TermCompletionTokenStream(reader);
            }
        };
    }
}
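
// ---------------------------------------------------------------------------
// Illustrative usage (a minimal sketch, not part of the original file): it
// drives the analyzer above with hyphen-aware patterns. The pattern choices,
// field name and sample text are assumptions made for the demonstration, and
// it presumes a Lucene version matching the imports at the top of this file.
class TermCompletionAnalyzerExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new TermCompletionAnalyzer(
                Pattern.compile("\\s+"), // words are separated by whitespace
                Pattern.compile("-"),    // also index after each hyphen
                Pattern.compile("-"),    // hyphens are soft: also index with them removed
                false);                  // ...but keep the un-discarded terms too
        try (TokenStream ts = analyzer.tokenStream("label",
                new StringReader("half-time"))) {
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            ts.reset(); // per the TokenStream contract
            while (ts.incrementToken()) {
                // For "half-time" this should print terms along the lines of:
                // halftime, half-time, time
                System.out.println(term.toString());
            }
            ts.end();
        }
        analyzer.close();
    }
}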



