// io.zulia.server.index.ShardDocumentIndexer Maven / Gradle / Ivy
package io.zulia.server.index;
import info.debatty.java.lsh.SuperBit;
import io.zulia.ZuliaConstants;
import io.zulia.message.ZuliaBase;
import io.zulia.message.ZuliaIndex;
import io.zulia.server.config.ServerIndexConfig;
import io.zulia.server.index.field.BooleanFieldIndexer;
import io.zulia.server.index.field.DateFieldIndexer;
import io.zulia.server.index.field.DoubleFieldIndexer;
import io.zulia.server.index.field.FieldTypeUtil;
import io.zulia.server.index.field.FloatFieldIndexer;
import io.zulia.server.index.field.IntFieldIndexer;
import io.zulia.server.index.field.LongFieldIndexer;
import io.zulia.server.index.field.StringFieldIndexer;
import io.zulia.util.ResultHelper;
import io.zulia.util.ZuliaUtil;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.FacetField;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import java.time.LocalDate;
import java.time.ZoneId;
import java.util.Date;
import java.util.List;
import java.util.Locale;
/**
 * Builds the Lucene {@link Document} for one stored record, driven by the field definitions
 * (index-as, sort-as, facet-as and project-as entries) held by a {@link ServerIndexConfig}.
 */
public class ShardDocumentIndexer {

    /** Time zone used to derive the calendar day of a {@link Date} when building date facets. */
    private static final ZoneId UTC = ZoneId.of("UTC");

    private final ServerIndexConfig indexConfig;

    /**
     * @param indexConfig source of the configured field names, field configs and superbit instances
     */
    public ShardDocumentIndexer(ServerIndexConfig indexConfig) {
        this.indexConfig = indexConfig;
    }

    /**
     * Creates the Lucene document for a record.
     *
     * <p>The document receives, in order: the facet/sort/index/projection fields derived from the
     * configured fields of {@code mongoDocument}, the stored id, the timestamp (as a point and as a
     * stored value), the serialized {@code mongoDocument}, and the serialized metadata.
     *
     * @param uniqueId      value stored under {@link ZuliaConstants#ID_FIELD}
     * @param timestamp     value written under {@link ZuliaConstants#TIMESTAMP_FIELD}
     * @param mongoDocument record whose configured fields are indexed and whose bytes are stored
     * @param metadataList  key/value pairs stored as a separate serialized document
     * @return the populated Lucene document
     * @throws Exception if a field value cannot be handled by one of the configured handlers
     */
    public Document getIndexDocument(String uniqueId, long timestamp, org.bson.Document mongoDocument, List<ZuliaBase.Metadata> metadataList)
            throws Exception {
        Document luceneDocument = new Document();

        addStoredFieldsForDocument(mongoDocument, luceneDocument);

        luceneDocument.add(new StringField(ZuliaConstants.ID_FIELD, uniqueId, Field.Store.YES));
        luceneDocument.add(new LongPoint(ZuliaConstants.TIMESTAMP_FIELD, timestamp));
        luceneDocument.add(new StoredField(ZuliaConstants.TIMESTAMP_FIELD, timestamp));
        luceneDocument.add(new StoredField(ZuliaConstants.STORED_DOC_FIELD, new BytesRef(ZuliaUtil.mongoDocumentToByteArray(mongoDocument))));

        org.bson.Document metadataMongoDoc = new org.bson.Document();
        for (ZuliaBase.Metadata metadata : metadataList) {
            metadataMongoDoc.put(metadata.getKey(), metadata.getValue());
        }
        luceneDocument.add(new StoredField(ZuliaConstants.STORED_META_FIELD, new BytesRef(ZuliaUtil.mongoDocumentToByteArray(metadataMongoDoc))));

        return luceneDocument;
    }

    /**
     * Runs the facet, sort, index and projection handlers for every configured stored field that has
     * a field config and a non-null value in {@code mongoDocument}.
     */
    private void addStoredFieldsForDocument(org.bson.Document mongoDocument, Document luceneDocument) throws Exception {
        for (String storedFieldName : indexConfig.getIndexedStoredFieldNames()) {
            ZuliaIndex.FieldConfig fc = indexConfig.getFieldConfig(storedFieldName);
            if (fc == null) {
                continue;
            }

            Object o = ResultHelper.getValueFromMongoDocument(mongoDocument, storedFieldName);
            if (o == null) {
                continue;
            }

            ZuliaIndex.FieldConfig.FieldType fieldType = fc.getFieldType();
            handleFacetsForStoredField(luceneDocument, fc, o);
            handleSortForStoredField(luceneDocument, storedFieldName, fc, o);
            handleIndexingForStoredField(luceneDocument, storedFieldName, fc, fieldType, o);
            handleProjectForStoredField(luceneDocument, fc, o);
        }
    }

    /**
     * For each project-as entry that has a superbit definition, converts the value (which must be a
     * list of numbers) to a {@code double[]}, computes its superbit signature and indexes every
     * signature bit as {@code "1"} or {@code "0"} under
     * {@code SUPERBIT_PREFIX.<field>.<bit position>}.
     *
     * @throws Exception if the value is not a list, or the list holds a non-numeric (or null) element
     */
    private void handleProjectForStoredField(Document luceneDocument, ZuliaIndex.FieldConfig fc, Object o) throws Exception {
        for (ZuliaIndex.ProjectAs projectAs : fc.getProjectAsList()) {
            if (!projectAs.hasSuperbit()) {
                continue;
            }

            String projectField = projectAs.getField();
            if (!(o instanceof List)) {
                throw new Exception("Expecting a list for superbit field <" + projectField + ">");
            }

            List<?> values = (List<?>) o;
            double[] vec = new double[values.size()];
            int i = 0;
            for (Object value : values) {
                // Fail with a descriptive message rather than a bare ClassCastException/NullPointerException.
                if (!(value instanceof Number)) {
                    throw new Exception(
                            "Expecting a list of numbers for superbit field <" + projectField + ">, found <" + describeClass(value) + ">");
                }
                vec[i++] = ((Number) value).doubleValue();
            }

            SuperBit superBitForField = indexConfig.getSuperBitForField(projectField);
            boolean[] signature = superBitForField.signature(vec);
            for (int j = 0; j < signature.length; j++) {
                StringFieldIndexer.INSTANCE.index(luceneDocument, projectField, signature[j] ? "1" : "0",
                        ZuliaConstants.SUPERBIT_PREFIX + "." + projectField + "." + j);
            }
        }
    }

    /**
     * For each index-as entry, records the indexed field name under
     * {@link ZuliaConstants#FIELDS_LIST_FIELD} and delegates to the field indexer matching
     * {@code fieldType}.
     *
     * @throws RuntimeException if {@code fieldType} has no matching indexer
     */
    private void handleIndexingForStoredField(Document luceneDocument, String storedFieldName, ZuliaIndex.FieldConfig fc,
            ZuliaIndex.FieldConfig.FieldType fieldType, Object o) throws Exception {
        for (ZuliaIndex.IndexAs indexAs : fc.getIndexAsList()) {
            String indexedFieldName = indexAs.getIndexFieldName();
            luceneDocument.add(new StringField(ZuliaConstants.FIELDS_LIST_FIELD, indexedFieldName, Field.Store.NO));

            if (ZuliaIndex.FieldConfig.FieldType.NUMERIC_INT.equals(fieldType)) {
                IntFieldIndexer.INSTANCE.index(luceneDocument, storedFieldName, o, indexedFieldName);
            }
            else if (ZuliaIndex.FieldConfig.FieldType.NUMERIC_LONG.equals(fieldType)) {
                LongFieldIndexer.INSTANCE.index(luceneDocument, storedFieldName, o, indexedFieldName);
            }
            else if (ZuliaIndex.FieldConfig.FieldType.NUMERIC_FLOAT.equals(fieldType)) {
                FloatFieldIndexer.INSTANCE.index(luceneDocument, storedFieldName, o, indexedFieldName);
            }
            else if (ZuliaIndex.FieldConfig.FieldType.NUMERIC_DOUBLE.equals(fieldType)) {
                DoubleFieldIndexer.INSTANCE.index(luceneDocument, storedFieldName, o, indexedFieldName);
            }
            else if (ZuliaIndex.FieldConfig.FieldType.DATE.equals(fieldType)) {
                DateFieldIndexer.INSTANCE.index(luceneDocument, storedFieldName, o, indexedFieldName);
            }
            else if (ZuliaIndex.FieldConfig.FieldType.BOOL.equals(fieldType)) {
                BooleanFieldIndexer.INSTANCE.index(luceneDocument, storedFieldName, o, indexedFieldName);
            }
            else if (ZuliaIndex.FieldConfig.FieldType.STRING.equals(fieldType)) {
                StringFieldIndexer.INSTANCE.index(luceneDocument, storedFieldName, o, indexedFieldName);
            }
            else {
                throw new RuntimeException("Unsupported field type <" + fieldType + ">");
            }
        }
    }

    /**
     * Adds one doc-values field per sort-as entry and per value handed to the callback by
     * {@code ZuliaUtil.handleLists} (presumably each element of a list value, or the value itself when
     * it is not a list; verify against {@code ZuliaUtil}).
     *
     * @throws RuntimeException if a value does not match the configured field type, or the field
     *                          type / string handling is not supported for sorting
     */
    private void handleSortForStoredField(Document d, String storedFieldName, ZuliaIndex.FieldConfig fc, Object o) {
        ZuliaIndex.FieldConfig.FieldType fieldType = fc.getFieldType();
        for (ZuliaIndex.SortAs sortAs : fc.getSortAsList()) {
            String sortFieldName = sortAs.getSortFieldName();

            if (FieldTypeUtil.isNumericOrDateFieldType(fieldType)) {
                ZuliaUtil.handleLists(o, obj -> d.add(toNumericSortField(fieldType, storedFieldName, sortFieldName, obj)));
            }
            else if (ZuliaIndex.FieldConfig.FieldType.BOOL.equals(fieldType)) {
                ZuliaUtil.handleLists(o, obj -> d.add(toBooleanSortField(storedFieldName, sortFieldName, obj)));
            }
            else if (ZuliaIndex.FieldConfig.FieldType.STRING.equals(fieldType)) {
                ZuliaUtil.handleLists(o, obj -> d.add(toStringSortField(sortAs, storedFieldName, sortFieldName, obj)));
            }
            else {
                throw new RuntimeException(
                        "Not handled field type <" + fieldType + "> for document field <" + storedFieldName + "> / sort field <" + sortFieldName
                                + ">");
            }
        }
    }

    /** Builds the numeric doc-values sort field for a single date or number value. */
    private static SortedNumericDocValuesField toNumericSortField(ZuliaIndex.FieldConfig.FieldType fieldType, String storedFieldName,
            String sortFieldName, Object value) {
        if (ZuliaIndex.FieldConfig.FieldType.DATE.equals(fieldType)) {
            if (!(value instanceof Date)) {
                throw new RuntimeException(
                        "Expecting date for document field <" + storedFieldName + "> / sort field <" + sortFieldName + ">, found <"
                                + describeClass(value) + ">");
            }
            return new SortedNumericDocValuesField(sortFieldName, ((Date) value).getTime());
        }

        if (!(value instanceof Number)) {
            throw new RuntimeException(
                    "Expecting number for document field <" + storedFieldName + "> / sort field <" + sortFieldName + ">, found <"
                            + describeClass(value) + ">");
        }

        Number number = (Number) value;
        if (ZuliaIndex.FieldConfig.FieldType.NUMERIC_INT.equals(fieldType)) {
            return new SortedNumericDocValuesField(sortFieldName, number.intValue());
        }
        if (ZuliaIndex.FieldConfig.FieldType.NUMERIC_LONG.equals(fieldType)) {
            return new SortedNumericDocValuesField(sortFieldName, number.longValue());
        }
        if (ZuliaIndex.FieldConfig.FieldType.NUMERIC_FLOAT.equals(fieldType)) {
            // Floating point values are stored in their sortable integer encoding.
            return new SortedNumericDocValuesField(sortFieldName, NumericUtils.floatToSortableInt(number.floatValue()));
        }
        if (ZuliaIndex.FieldConfig.FieldType.NUMERIC_DOUBLE.equals(fieldType)) {
            return new SortedNumericDocValuesField(sortFieldName, NumericUtils.doubleToSortableLong(number.doubleValue()));
        }
        throw new RuntimeException(
                "Not handled numeric field type <" + fieldType + "> for document field <" + storedFieldName + "> / sort field <" + sortFieldName
                        + ">");
    }

    /** Builds the doc-values sort field for a single boolean value, stored as its text form. */
    private static SortedSetDocValuesField toBooleanSortField(String storedFieldName, String sortFieldName, Object value) {
        if (!(value instanceof Boolean)) {
            throw new RuntimeException(
                    "Expecting boolean for document field <" + storedFieldName + "> / sort field <" + sortFieldName + ">, found <"
                            + describeClass(value) + ">");
        }
        return new SortedSetDocValuesField(sortFieldName, new BytesRef(value.toString()));
    }

    /**
     * Builds the doc-values sort field for a single string value after applying the sort-as string
     * handling (none, lowercase, ASCII folding, or both).
     */
    private static SortedSetDocValuesField toStringSortField(ZuliaIndex.SortAs sortAs, String storedFieldName, String sortFieldName,
            Object value) {
        // Use the individual value, not the enclosing list, so multi-valued fields sort on their elements.
        String text = value.toString();

        ZuliaIndex.SortAs.StringHandling stringHandling = sortAs.getStringHandling();
        if (ZuliaIndex.SortAs.StringHandling.STANDARD.equals(stringHandling)) {
            // text is used as is
        }
        else if (ZuliaIndex.SortAs.StringHandling.LOWERCASE.equals(stringHandling)) {
            // Locale.ROOT keeps the sort key independent of the JVM default locale.
            text = text.toLowerCase(Locale.ROOT);
        }
        else if (ZuliaIndex.SortAs.StringHandling.FOLDING.equals(stringHandling)) {
            text = getFoldedString(text);
        }
        else if (ZuliaIndex.SortAs.StringHandling.LOWERCASE_FOLDING.equals(stringHandling)) {
            text = getFoldedString(text).toLowerCase(Locale.ROOT);
        }
        else {
            throw new RuntimeException(
                    "Not handled string handling <" + stringHandling + "> for document field <" + storedFieldName + "> / sort field <"
                            + sortFieldName + ">");
        }

        return new SortedSetDocValuesField(sortFieldName, new BytesRef(text));
    }

    /**
     * Adds a facet for each facet-as entry and each value handed to the callback by
     * {@code ZuliaUtil.handleLists}. Date fields are formatted according to the facet's date
     * handling; every other field type uses the value's {@code toString()}.
     *
     * @throws RuntimeException if a date field holds a non-date value or the date handling is not supported
     */
    private void handleFacetsForStoredField(Document doc, ZuliaIndex.FieldConfig fc, Object o) throws Exception {
        for (ZuliaIndex.FacetAs fa : fc.getFacetAsList()) {
            String facetName = fa.getFacetName();
            if (ZuliaIndex.FieldConfig.FieldType.DATE.equals(fc.getFieldType())) {
                ZuliaUtil.handleLists(o, obj -> addFacet(doc, facetName, toDateFacetValue(fc, fa, obj)));
            }
            else {
                ZuliaUtil.handleLists(o, obj -> addFacet(doc, facetName, obj.toString()));
            }
        }
    }

    /** Formats a single date value as the facet label requested by the facet's date handling. */
    private static String toDateFacetValue(ZuliaIndex.FieldConfig fc, ZuliaIndex.FacetAs fa, Object value) {
        if (!(value instanceof Date)) {
            String found = (value == null) ? "null" : value.getClass().getSimpleName();
            throw new RuntimeException("Cannot facet date for document field <" + fc.getStoredFieldName() + "> / facet <" + fa.getFacetName()
                    + ">: expected Date or Collection of Date, found <" + found + ">");
        }

        LocalDate localDate = ((Date) value).toInstant().atZone(UTC).toLocalDate();
        ZuliaIndex.FacetAs.DateHandling dateHandling = fa.getDateHandling();

        // The year is padded to four digits and Locale.ROOT guarantees ASCII digits on every JVM.
        if (ZuliaIndex.FacetAs.DateHandling.DATE_YYYYMMDD.equals(dateHandling)) {
            return String.format(Locale.ROOT, "%04d%02d%02d", localDate.getYear(), localDate.getMonthValue(), localDate.getDayOfMonth());
        }
        if (ZuliaIndex.FacetAs.DateHandling.DATE_YYYY_MM_DD.equals(dateHandling)) {
            return String.format(Locale.ROOT, "%04d-%02d-%02d", localDate.getYear(), localDate.getMonthValue(), localDate.getDayOfMonth());
        }
        throw new RuntimeException("Not handled date handling <" + dateHandling + "> for facet <" + fa.getFacetName() + ">");
    }

    /**
     * Adds a non-empty facet value to the document both as a {@link FacetField} and as a non-stored
     * {@link StringField} named {@code FacetsConfig.DEFAULT_INDEX_FIELD_NAME.<facetName>}. Empty
     * values are skipped.
     */
    private void addFacet(Document doc, String facetName, String value) {
        if (value.isEmpty()) {
            return;
        }
        doc.add(new FacetField(facetName, value));
        doc.add(new StringField(FacetsConfig.DEFAULT_INDEX_FIELD_NAME + "." + facetName, new BytesRef(value), Field.Store.NO));
    }

    /**
     * Returns {@code text} with characters at or above U+0080 folded through
     * {@link ASCIIFoldingFilter#foldToASCII}; text without any such character is returned unchanged.
     */
    private static String getFoldedString(String text) {
        boolean needsFolding = false;
        for (int pos = 0; pos < text.length(); ++pos) {
            if (text.charAt(pos) >= '\u0080') {
                needsFolding = true;
                break;
            }
        }
        if (!needsFolding) {
            return text;
        }

        char[] textChar = text.toCharArray();
        // Output buffer is four chars per input char; NOTE(review): matches the sizing foldToASCII documents, confirm.
        char[] output = new char[textChar.length * 4];
        int outputPos = ASCIIFoldingFilter.foldToASCII(textChar, 0, output, 0, textChar.length);
        return new String(output, 0, outputPos);
    }

    /** Returns the text of the value's class for error messages, or {@code "null"} for a null value. */
    private static String describeClass(Object value) {
        return (value == null) ? "null" : value.getClass().toString();
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy