// com.fnklabs.draenei.analytics.search.ClusteringTfAlgorithm (Maven / Gradle / Ivy)
package com.fnklabs.draenei.analytics.search;
import com.fnklabs.draenei.analytics.TextUtils;
import com.fnklabs.draenei.analytics.morphology.Language;
import org.jetbrains.annotations.NotNull;
import org.slf4j.LoggerFactory;
import java.beans.BeanInfo;
import java.beans.IntrospectionException;
import java.beans.Introspector;
import java.beans.PropertyDescriptor;
import java.io.Serializable;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.*;
import java.util.function.BiFunction;
import java.util.stream.Collectors;
/**
* Content based TF clustering algorithm
*/
class ClusteringTfAlgorithm implements ClusteringAlgorithm {
@NotNull
private TextUtils textUtils;
public ClusteringTfAlgorithm(@NotNull TextUtils textUtils) {
this.textUtils = textUtils;
}
@NotNull
@Override
public Set build(@NotNull Object content) {
Map> facetsMap = buildFacets(content);
Set facets = new HashSet<>();
int totalFacets = facetsMap.entrySet()
.stream()
.mapToInt(entry -> entry.getValue().size())
.sum();
facetsMap.forEach((key, values) -> {
double rank = TfIdfUtils.calculateTf(values.size(), totalFacets);
facets.add(new Facet(key, rank, 0));
});
return facets;
}
@Override
public Set build(@NotNull Document document) {
Map> facetsMap = buildFacets(document.getId());
Set facets = new HashSet<>();
int totalFacets = facetsMap.entrySet()
.stream()
.mapToInt(entry -> entry.getValue().size())
.sum();
facetsMap.forEach((key, values) -> {
double rank = TfIdfUtils.calculateTf(values.size(), totalFacets);
facets.add(new Facet(key, rank, document.getId()));
});
return facets;
}
@NotNull
private Map> buildFacets(@NotNull Object content) {
Map> facetsMap = new HashMap<>();
if (!content.getClass().isPrimitive() && !(content instanceof String)) {
try {
BeanInfo beanInfo = Introspector.getBeanInfo(content.getClass());
for (PropertyDescriptor propertyDescriptor : beanInfo.getPropertyDescriptors()) {
Method readMethod = propertyDescriptor.getReadMethod();
String name = propertyDescriptor.getName();
if (name.equals("class")) {
continue;
}
Field field = content.getClass().getDeclaredField(name);
boolean annotationPresent = field.isAnnotationPresent(com.fnklabs.draenei.analytics.search.annotation.Facet.class);
if (annotationPresent) {
Object fieldValue = getFieldValue(content, readMethod);
if (fieldValue != null) {
List values = transformValue(fieldValue);
values.forEach(val -> {
FacetType facetType = new FacetType(field.getName(), val.getClass());
FacetKey key = new FacetKey(facetType, val);
facetsMap.compute(key, new AddFunction());
});
}
}
}
} catch (IntrospectionException | NoSuchFieldException e) {
LoggerFactory.getLogger(getClass()).warn("Can't read value", e);
}
} else {
transformValue(content).forEach(val -> {
FacetKey key = new FacetKey(new FacetType("primitive", val.getClass()), val);
facetsMap.compute(key, new AddFunction());
}
);
}
return facetsMap;
}
/**
* Try to transform field value to simple types
*
* @param value
*
* @return
*/
private List transformValue(@NotNull Object value) {
List values = new ArrayList<>();
if (value instanceof String) {
List build = build((String) value);
values.addAll(build);
} else if (value instanceof Collection) {
Collection collection = (Collection) value;
List collect = collection.stream()
.flatMap(item -> {
return transformValue(item).stream();
})
.collect(Collectors.toList());
values.addAll(collect);
} else {
values.add((Serializable) value);
}
return values;
}
private Object getFieldValue(@NotNull Object content, Method field) {
try {
return field.invoke(content);
} catch (InvocationTargetException | IllegalAccessException e) {
LoggerFactory.getLogger(ClusteringTfAlgorithm.class).warn("Can't get field value", e);
}
return null;
}
/**
* Build String facet from text
*
* @param text Text content
*
* @return Word facets
*/
private List build(@NotNull String text) {
List words = textUtils.extractWords(text, Language.RU);
List wordList = words.stream()
.flatMap(word -> {
return textUtils.getNormalForms(word.toLowerCase(), Language.RU)
.stream()
.filter(element -> textUtils.isNormalWord(element, Language.RU));
})
.collect(Collectors.toList());
return wordList;
}
private static class AddFunction implements BiFunction, List> {
@Override
public List apply(FacetKey key, List keys) {
if (keys == null) {
keys = new ArrayList<>();
}
keys.add(key);
return keys;
}
}
}