All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.zulia.server.index.ShardTermsHandler Maven / Gradle / Ivy

package io.zulia.server.index;

import io.zulia.message.ZuliaBase;
import io.zulia.message.ZuliaServiceOuterClass.GetTermsRequest;
import io.zulia.message.ZuliaServiceOuterClass.GetTermsResponse;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;

public class ShardTermsHandler {

	private final DirectoryReader indexReader;

	public ShardTermsHandler(DirectoryReader indexReader) {
		this.indexReader = indexReader;
	}

	public GetTermsResponse handleShardTerms(GetTermsRequest request) throws IOException {
		GetTermsResponse.Builder builder = GetTermsResponse.newBuilder();

		String fieldName = request.getFieldName();

		SortedMap termsMap = new TreeMap<>();

		if (request.getIncludeTermCount() > 0) {

			Set includeTerms = new TreeSet<>(request.getIncludeTermList());
			List termBytesList = new ArrayList<>();
			for (String term : includeTerms) {
				BytesRef termBytes = new BytesRef(term);
				termBytesList.add(termBytes);
			}

			for (LeafReaderContext subReaderContext : indexReader.leaves()) {
				Terms terms = subReaderContext.reader().terms(fieldName);

				if (terms != null) {

					TermsEnum termsEnum = terms.iterator();
					for (BytesRef termBytes : termBytesList) {
						if (termsEnum.seekExact(termBytes)) {
							BytesRef text = termsEnum.term();
							handleTerm(termsMap, termsEnum, text, null, null);
						}

					}
				}

			}
		}
		else {

			AttributeSource atts = null;
			MaxNonCompetitiveBoostAttribute maxBoostAtt = null;
			boolean hasFuzzyTerm = request.hasFuzzyTerm();
			if (hasFuzzyTerm) {
				atts = new AttributeSource();
				maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
			}

			BytesRef startTermBytes;
			BytesRef endTermBytes = null;

			if (!request.getStartTerm().isEmpty()) {
				startTermBytes = new BytesRef(request.getStartTerm());
			}
			else {
				startTermBytes = new BytesRef("");
			}

			if (!request.getEndTerm().isEmpty()) {
				endTermBytes = new BytesRef(request.getEndTerm());
			}

			Pattern termFilter = null;
			if (!request.getTermFilter().isEmpty()) {
				termFilter = Pattern.compile(request.getTermFilter());
			}

			Pattern termMatch = null;
			if (!request.getTermMatch().isEmpty()) {
				termMatch = Pattern.compile(request.getTermMatch());
			}

			for (LeafReaderContext subReaderContext : indexReader.leaves()) {
				Terms terms = subReaderContext.reader().terms(fieldName);

				if (terms != null) {

					if (hasFuzzyTerm) {
						ZuliaBase.FuzzyTerm fuzzyTerm = request.getFuzzyTerm();
						FuzzyTermsEnum termsEnum = new FuzzyTermsEnum(terms, atts, new Term(fieldName, fuzzyTerm.getTerm()), fuzzyTerm.getEditDistance(),
								fuzzyTerm.getPrefixLength(), !fuzzyTerm.getNoTranspositions());
						BytesRef text = termsEnum.term();

						handleTerm(termsMap, termsEnum, text, termFilter, termMatch);

						while ((text = termsEnum.next()) != null) {
							handleTerm(termsMap, termsEnum, text, termFilter, termMatch);
						}

					}
					else {
						TermsEnum termsEnum = terms.iterator();
						TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(startTermBytes);

						if (!seekStatus.equals(TermsEnum.SeekStatus.END)) {
							BytesRef text = termsEnum.term();

							if (endTermBytes == null || (text.compareTo(endTermBytes) < 0)) {
								handleTerm(termsMap, termsEnum, text, termFilter, termMatch);

								while ((text = termsEnum.next()) != null) {

									if (endTermBytes == null || (text.compareTo(endTermBytes) < 0)) {
										handleTerm(termsMap, termsEnum, text, termFilter, termMatch);
									}
									else {
										break;
									}
								}
							}
						}
					}

				}

			}
		}

		for (ZuliaBase.Term.Builder termBuilder : termsMap.values()) {
			builder.addTerm(termBuilder.build());
		}

		return builder.build();
	}

	private void handleTerm(SortedMap termsMap, TermsEnum termsEnum, BytesRef text, Pattern termFilter, Pattern termMatch)
			throws IOException {

		String textStr = text.utf8ToString();
		if (termFilter != null || termMatch != null) {

			if (termFilter != null) {
				if (termFilter.matcher(textStr).matches()) {
					return;
				}
			}

			if (termMatch != null) {
				if (!termMatch.matcher(textStr).matches()) {
					return;
				}
			}
		}

		if (!termsMap.containsKey(textStr)) {
			termsMap.put(textStr, ZuliaBase.Term.newBuilder().setValue(textStr).setDocFreq(0).setTermFreq(0));
		}
		ZuliaBase.Term.Builder builder = termsMap.get(textStr);
		builder.setDocFreq(builder.getDocFreq() + termsEnum.docFreq());
		builder.setTermFreq(builder.getTermFreq() + termsEnum.totalTermFreq());
		BoostAttribute boostAttribute = termsEnum.attributes().getAttribute(BoostAttribute.class);
		if (boostAttribute != null) {
			builder.setScore(boostAttribute.getBoost());
		}

	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy