Please wait. This can take a few minutes...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price: only $1
You can buy this project and then download or modify it as often as you want.
org.codelibs.elasticsearch.minhash.index.mapper.MinHashFieldMapper Maven / Gradle / Ivy
package org.codelibs.elasticsearch.minhash.index.mapper;
import static org.elasticsearch.common.xcontent.support.XContentMapValues.isArray;
import static org.elasticsearch.common.xcontent.support.XContentMapValues.nodeStringValue;
import java.io.IOException;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.Supplier;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocValuesFieldExistsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;
import org.codelibs.minhash.MinHash;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.util.CollectionUtils;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.plain.SortedSetOrdinalsIndexFieldData;
import org.elasticsearch.index.mapper.ContentPath;
import org.elasticsearch.index.mapper.CustomDocValuesField;
import org.elasticsearch.index.mapper.FieldAliasMapper;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.FieldNamesFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.Mapper;
import org.elasticsearch.index.mapper.Mapper.TypeParser.ParserContext;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.SourceValueFetcher;
import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.index.mapper.ValueFetcher;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.index.query.QueryShardException;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
import org.elasticsearch.search.lookup.SearchLookup;
import com.carrotsearch.hppc.ObjectArrayList;
public class MinHashFieldMapper extends FieldMapper {
/** Type name returned by {@link #contentType()} and by the field type's {@code typeName()}. */
public static final String CONTENT_TYPE = "minhash";

/**
 * Narrows a generic {@link FieldMapper} to a {@link MinHashFieldMapper}.
 *
 * @param in the mapper to narrow; a {@link ClassCastException} is raised if it is
 *           not a {@link MinHashFieldMapper}
 * @return the same instance, typed as {@link MinHashFieldMapper}
 */
private static MinHashFieldMapper toType(FieldMapper in) {
    final MinHashFieldMapper narrowed = (MinHashFieldMapper) in;
    return narrowed;
}
public static class Builder extends FieldMapper.Builder {
private final Parameter stored = Parameter.boolParam("store", false, m -> toType(m).stored, true);
private final Parameter hasDocValues = Parameter.boolParam("doc_values", false, m -> toType(m).hasDocValues, false);
private final Parameter nullValue = Parameter.stringParam("null_value", false, m->toType(m).nullValue, null);
private final Parameter> meta = Parameter.metaParam();
private final Parameter minhashAnalyzer = Parameter
.stringParam("minhash_analyzer", true, m -> {
NamedAnalyzer minhashAnalyzer = toType(m).minhashAnalyzer;
if (minhashAnalyzer != null) {
return minhashAnalyzer.name();
}
return "standard";
}, "standard");
private final Parameter copyBitsTo = new Parameter<>(
"copy_bits_to", true, () -> new String[0],
(n, c, o) -> parseCopyBitsFields(o), m -> {
List fieldList = toType(m).copyBitsTo
.copyBitsToFields();
return fieldList.toArray(new String[fieldList.size()]);
});
private ParserContext parserContext;
private NamedAnalyzer mergedAnalyzer;
public Builder(String name) {
this(name, null, false);
}
public Builder(String name, ParserContext parserContext, boolean hasDocValues) {
super(name);
this.parserContext = parserContext;
this.hasDocValues.setValue(hasDocValues);
}
@Override
public List> getParameters() {
return Arrays.asList(meta, stored, hasDocValues, nullValue, minhashAnalyzer, copyBitsTo);
}
@Override
public Builder init(FieldMapper initializer) {
super.init(initializer);
return this;
}
public Builder minhashAnalyzer(NamedAnalyzer minhashAnalyzer) {
this.mergedAnalyzer = minhashAnalyzer;
return this;
}
private NamedAnalyzer minhashAnalyzer() {
if (mergedAnalyzer != null) {
return mergedAnalyzer;
}
if (parserContext != null) {
return parserContext.getIndexAnalyzers()
.get(minhashAnalyzer.getValue());
}
return null;
}
private CopyBitsTo copyBitsTo() {
final CopyBitsTo.Builder copyToBuilder = new CopyBitsTo.Builder();
for (final String value : copyBitsTo.getValue()) {
copyToBuilder.add(value);
}
return copyToBuilder.build();
}
@Override
public MinHashFieldMapper build(ContentPath contentPath) {
return new MinHashFieldMapper(name,
new MinHashFieldType(buildFullName(contentPath), true,
hasDocValues.getValue(), meta.getValue()),
multiFieldsBuilder.build(this, contentPath), copyTo.build(),
this, minhashAnalyzer(), copyBitsTo());
}
}
/**
 * Mapping parser for the {@code minhash} type: creates a {@link Builder} bound to
 * the parser context and lets it consume the mapping definition.
 */
public static class TypeParser implements Mapper.TypeParser {

    /**
     * @param name          field name
     * @param node          mapping definition of the field
     * @param parserContext context handed to the builder for analyzer lookup and parsing
     * @return a builder populated from {@code node}
     * @throws MapperParsingException if the definition cannot be parsed
     */
    @Override
    public MinHashFieldMapper.Builder parse(final String name, final Map<String, Object> node,
            final ParserContext parserContext) throws MapperParsingException {
        final MinHashFieldMapper.Builder builder = new MinHashFieldMapper.Builder(
                name, parserContext, false);
        builder.parse(name, parserContext, node);
        return builder;
    }
}
/**
 * Converts the raw {@code copy_bits_to} mapping value into an array of field names.
 *
 * <p>A list value yields one entry per non-null element; any other value yields a
 * single entry holding its string form. A {@code null} value yields an empty
 * array, so that neither branch ever produces a {@code null} field name.
 *
 * @param propNode raw mapping value (a list or a scalar)
 * @return the field names, never containing {@code null}
 */
public static String[] parseCopyBitsFields(final Object propNode) {
    if (isArray(propNode)) {
        @SuppressWarnings("unchecked")
        final List<Object> nodeList = (List<Object>) propNode;
        return nodeList.stream()
                .map(o -> nodeStringValue(o, null))
                .filter(s -> s != null)
                .toArray(String[]::new);
    }
    final String field = nodeStringValue(propNode, null);
    // Mirror the null filtering applied to list values above.
    return field == null ? new String[0] : new String[] { field };
}
/**
 * Field type of {@code minhash} fields. Values are exposed as binary data; term
 * queries are rejected.
 */
static final class MinHashFieldType extends MappedFieldType {

    /**
     * @param name         full field name
     * @param isStored     stored flag passed to the base type
     * @param hasDocValues doc-values flag passed to the base type
     * @param meta         field metadata
     */
    public MinHashFieldType(String name, boolean isStored, boolean hasDocValues, Map<String, String> meta) {
        // The second argument is fixed to false and no text search info is provided.
        super(name, false, isStored, hasDocValues, TextSearchInfo.NONE, meta);
    }

    /**
     * Creates a stored type with doc values and empty metadata.
     *
     * @param name full field name
     */
    public MinHashFieldType(String name) {
        this(name, true, true, Collections.emptyMap());
    }

    @Override
    public String typeName() {
        return CONTENT_TYPE;
    }

    @Override
    public ValueFetcher valueFetcher(QueryShardContext context, String format) {
        return SourceValueFetcher.identity(name(), context, format);
    }

    /** Always returns {@link DocValueFormat#BINARY}; both arguments are ignored. */
    @Override
    public DocValueFormat docValueFormat(String format, ZoneId timeZone) {
        return DocValueFormat.BINARY;
    }

    /**
     * Wraps a raw value as a {@link BytesReference}.
     *
     * @param value a {@link BytesRef}, {@link BytesReference}, {@code byte[]}, or any
     *              other object whose {@code toString()} is Base64 text; may be {@code null}
     * @return the bytes, or {@code null} for a {@code null} input
     */
    @Override
    public BytesReference valueForDisplay(final Object value) {
        if (value == null) {
            return null;
        }
        if (value instanceof BytesRef) {
            return new BytesArray((BytesRef) value);
        }
        if (value instanceof BytesReference) {
            return (BytesReference) value;
        }
        if (value instanceof byte[]) {
            return new BytesArray((byte[]) value);
        }
        // Anything else is treated as Base64-encoded text.
        return new BytesArray(Base64.getDecoder().decode(value.toString()));
    }

    @Override
    public IndexFieldData.Builder fielddataBuilder(String fullyQualifiedIndexName, Supplier<SearchLookup> searchLookup) {
        failIfNoDocValues();
        return new SortedSetOrdinalsIndexFieldData.Builder(name(), CoreValuesSourceType.BYTES);
    }

    /**
     * Uses a doc-values existence query when doc values are enabled; otherwise
     * matches this field's name in the field-names field.
     */
    @Override
    public Query existsQuery(QueryShardContext context) {
        if (hasDocValues()) {
            return new DocValuesFieldExistsQuery(name());
        }
        return new TermQuery(new Term(FieldNamesFieldMapper.NAME, name()));
    }

    /**
     * Not supported.
     *
     * @throws QueryShardException always
     */
    @Override
    public Query termQuery(final Object value, final QueryShardContext context) {
        throw new QueryShardException(context,
                "MinHash fields do not support searching");
    }
}
/** Whether the hash is added to the document as a stored field. */
private final boolean stored;

/** Whether the hash is added to the document as binary doc values. */
private final boolean hasDocValues;

/** Replacement text for explicit null values; may be {@code null}. */
private final String nullValue;

/** Analyzer passed to the MinHash calculation. */
private final NamedAnalyzer minhashAnalyzer;

/** Target fields that receive the hash as a bit string. */
private final CopyBitsTo copyBitsTo;

/**
 * Creates the mapper from a configured builder.
 *
 * @param simpleName      simple name of the field
 * @param mappedFieldType field type of this mapper
 * @param multiFields     multi-field configuration
 * @param copyTo          regular copy-to configuration
 * @param builder         builder supplying {@code store}, {@code doc_values} and {@code null_value}
 * @param minhashAnalyzer analyzer used when hashing values
 * @param copyBitsTo      fields that receive the hash as a bit string
 */
protected MinHashFieldMapper(String simpleName, MappedFieldType mappedFieldType,
        MultiFields multiFields, CopyTo copyTo, Builder builder,
        NamedAnalyzer minhashAnalyzer, CopyBitsTo copyBitsTo) {
    super(simpleName, mappedFieldType, multiFields, copyTo);
    this.stored = builder.stored.getValue();
    this.hasDocValues = builder.hasDocValues.getValue();
    this.nullValue = builder.nullValue.getValue();
    this.minhashAnalyzer = minhashAnalyzer;
    this.copyBitsTo = copyBitsTo;
}
/**
 * Hashes the incoming text and adds the result to the current document.
 *
 * <p>Nothing happens when neither storing nor doc values is enabled, or when the
 * resolved text is {@code null}. Otherwise the hash is added as a stored field
 * and/or appended to the document's doc-values field, and finally handed to the
 * {@code copy_bits_to} targets as a bit string.
 *
 * @param context parse context of the document being indexed
 * @throws IOException if reading the value or parsing a copy target fails
 */
@Override
protected void parseCreateField(final ParseContext context) throws IOException {
    if (!stored && !hasDocValues) {
        return;
    }

    // Resolve the text to hash: external value first, otherwise the parser token.
    final String text;
    if (context.externalValueSet()) {
        text = context.externalValue().toString();
    } else {
        final XContentParser xcp = context.parser();
        text = xcp.currentToken() == XContentParser.Token.VALUE_NULL
                ? nullValue
                : xcp.textOrNull();
    }
    if (text == null) {
        return;
    }

    final byte[] hash = MinHash.calculate(minhashAnalyzer, text);
    final String fieldName = fieldType().name();

    if (stored) {
        context.doc().add(new StoredField(fieldName, hash));
    }

    if (hasDocValues) {
        // One doc-values field per document and field name; further values are appended.
        final CustomMinHashDocValuesField existing =
                (CustomMinHashDocValuesField) context.doc().getByKey(fieldName);
        if (existing != null) {
            existing.add(hash);
        } else {
            context.doc().addWithKey(fieldName,
                    new CustomMinHashDocValuesField(fieldName, hash));
        }
    } else {
        // Stored-only fields need an entry in the field names field so that
        // exists queries work without doc values.
        createFieldNamesField(context);
    }

    if (copyBitsTo.copyBitsToFields().isEmpty() == false) {
        final ParseContext bitsContext = context.createExternalValueContext(
                MinHash.toBinaryString(hash));
        parseCopyBitsFields(bitsContext, copyBitsTo.copyBitsToFields());
    }
}
/**
 * Creates instances of the fields that the current field's bits should be copied to.
 *
 * <p>Does nothing when the context is already within a copy-to or when there are
 * no target fields.
 *
 * @param context      context carrying the value to copy
 * @param copyToFields full names of the target fields
 * @throws IOException if parsing a target field fails
 */
private static void parseCopyBitsFields(ParseContext context,
        final List<String> copyToFields) throws IOException {
    if (context.isWithinCopyTo() || copyToFields.isEmpty()) {
        return;
    }
    context = context.createCopyToContext();
    for (final String field : copyToFields) {
        // In case of a hierarchy of nested documents, we need to figure out
        // which document the field should go to
        ParseContext.Document targetDoc = null;
        for (ParseContext.Document doc = context.doc(); doc != null; doc = doc.getParent()) {
            if (field.startsWith(doc.getPrefix())) {
                targetDoc = doc;
                break;
            }
        }
        assert targetDoc != null;
        final ParseContext copyToContext;
        if (targetDoc == context.doc()) {
            copyToContext = context;
        } else {
            copyToContext = context.switchDoc(targetDoc);
        }
        parseCopy(field, copyToContext);
    }
}
/**
 * Parses the context's current value into the mapper registered under the given
 * field name. A field without a mapper is silently skipped.
 *
 * @param field   full name of the target field
 * @param context context carrying the value to copy
 * @throws IOException              if the target mapper fails to parse the value
 * @throws IllegalArgumentException if the target is a field alias
 * @throws IllegalStateException    if the target mapper is of an unrecognized kind
 */
private static void parseCopy(final String field, final ParseContext context)
        throws IOException {
    final Mapper target = context.docMapper().mappers().getMapper(field);
    if (target == null) {
        return;
    }
    if (target instanceof FieldMapper) {
        ((FieldMapper) target).parse(context);
        return;
    }
    if (target instanceof FieldAliasMapper) {
        throw new IllegalArgumentException("Cannot copy to a field alias [" + target.name() + "].");
    }
    throw new IllegalStateException("The provided mapper [" + target.name() +
            "] has an unrecognized type [" + target.getClass().getSimpleName() + "].");
}
/**
 * Returns a builder initialised from this mapper and carrying this mapper's
 * analyzer instance.
 */
@Override
public FieldMapper.Builder getMergeBuilder() {
    return new MinHashFieldMapper.Builder(simpleName())
            .init(this)
            .minhashAnalyzer(this.minhashAnalyzer);
}
/** Returns the mapping type name, {@link #CONTENT_TYPE}. */
@Override
protected String contentType() {
    return MinHashFieldMapper.CONTENT_TYPE;
}
/**
 * Doc-values field that collects the hash values of one document field and
 * encodes them into a single binary value.
 */
public static class CustomMinHashDocValuesField extends CustomDocValuesField {

    /** Collected hash values, in insertion order until {@link #binaryValue()} sorts them. */
    private final ObjectArrayList<byte[]> bytesList;

    /** Sum of the lengths of all added values (duplicates included). */
    private int totalSize = 0;

    /**
     * @param name  field name
     * @param bytes first hash value
     */
    public CustomMinHashDocValuesField(String name, byte[] bytes) {
        super(name);
        bytesList = new ObjectArrayList<>();
        add(bytes);
    }

    /**
     * Appends another hash value.
     *
     * @param bytes hash value to add
     */
    public void add(byte[] bytes) {
        bytesList.add(bytes);
        totalSize += bytes.length;
    }

    /**
     * Encodes the collected values as: value count (vInt), then for each value its
     * length (vInt) followed by its bytes. The values are sorted and de-duplicated
     * in place first.
     *
     * @return the encoded bytes, trimmed to the length actually written
     * @throws ElasticsearchException if writing to the buffer fails
     */
    @Override
    public BytesRef binaryValue() {
        try {
            CollectionUtils.sortAndDedup(bytesList);
            final int size = bytesList.size();
            // Upper bound: all payload bytes plus up to 5 bytes per vInt
            // (one count and one length per value).
            final byte[] bytes = new byte[totalSize + (size + 1) * 5];
            final ByteArrayDataOutput out = new ByteArrayDataOutput(bytes);
            out.writeVInt(size); // write total number of values
            for (int i = 0; i < size; i++) {
                final byte[] value = bytesList.get(i);
                final int valueLength = value.length;
                out.writeVInt(valueLength);
                out.writeBytes(value, 0, valueLength);
            }
            return new BytesRef(bytes, 0, out.getPosition());
        } catch (IOException e) {
            throw new ElasticsearchException("Failed to get MinHash value", e);
        }
    }
}
/**
 * Immutable list of field names that receive the hash value as a bit string.
 */
public static class CopyBitsTo {

    private final List<String> copyBitsToFields;

    private CopyBitsTo(final List<String> copyBitsToFields) {
        this.copyBitsToFields = copyBitsToFields;
    }

    /**
     * Writes the {@code copy_bits_to} array; writes nothing when there are no targets.
     *
     * @param builder output builder
     * @param params  serialization parameters (unused)
     * @return the given builder
     * @throws IOException if writing fails
     */
    public XContentBuilder toXContent(final XContentBuilder builder,
            final Params params) throws IOException {
        if (!copyBitsToFields.isEmpty()) {
            builder.startArray("copy_bits_to");
            for (final String field : copyBitsToFields) {
                builder.value(field);
            }
            builder.endArray();
        }
        return builder;
    }

    /** Accumulates target field names for a {@link CopyBitsTo}. */
    public static class Builder {

        private final List<String> copyBitsToBuilders = new ArrayList<>();

        /**
         * Adds a target field.
         *
         * @param field name of the target field
         * @return this builder
         */
        public Builder add(final String field) {
            copyBitsToBuilders.add(field);
            return this;
        }

        /**
         * Builds the target list.
         *
         * @return a {@link CopyBitsTo} holding a snapshot of the fields added so far
         */
        public CopyBitsTo build() {
            // Copy first so that later add() calls cannot alter an already built instance.
            return new CopyBitsTo(
                    Collections.unmodifiableList(new ArrayList<>(copyBitsToBuilders)));
        }
    }

    /**
     * @return the target field names as an unmodifiable list
     */
    public List<String> copyBitsToFields() {
        return copyBitsToFields;
    }
}
}