// edu.byu.hbll.box.impl.FacetView (Maven / Gradle / Ivy)
package edu.byu.hbll.box.impl;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import edu.byu.hbll.box.BoxConfigurable;
import edu.byu.hbll.box.BoxDocument;
import edu.byu.hbll.box.BoxQuery;
import edu.byu.hbll.box.ConstructConfig;
import edu.byu.hbll.box.Facet;
import edu.byu.hbll.box.InitConfig;
import edu.byu.hbll.box.QueryResult;
import edu.byu.hbll.box.client.AbstractHttpBoxClient;
import edu.byu.hbll.box.internal.util.JsonUtils;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.NoArgsConstructor;
import lombok.experimental.SuperBuilder;
/**
 * A view that translates a facet value into an id and includes all documents with that facet in the
 * final document.
 *
 * <p>NOTE: when using the builder, you will need to make sure that {@code @box.facets} is included
 * in the field projection.
 *
 * <p>IMPORTANT: {@link FacetView} has some weaknesses. First, you cannot force the processing of
 * upstream documents based on a facet. A facet is one-way. Documents get tagged with a facet once
 * they've been processed another way. Second, when a document drops its facet value recognized by
 * this view, it is impossible to know which group it once belonged to in order to signal the
 * reprocessing of the group. Therefore it is good practice to occasionally reprocess documents
 * dependent on this view. Third, all documents corresponding to a facet id are brought in. If
 * there are very many documents per facet id, the resulting facet documents could be too large to
 * fit in memory. Set the {@code documentLimit} in this case.
 *
 * @author Charles Draper
 */
@SuperBuilder(toBuilder = true)
@NoArgsConstructor(access = AccessLevel.PROTECTED)
@AllArgsConstructor(access = AccessLevel.PROTECTED)
public class FacetView extends View {

  /** Name of the facet whose values are used as the ids of the documents this view produces. */
  private String facetName;

  /** Limit applied to each per-facet-value query sent upstream. */
  @Builder.Default private long documentLimit = BoxQuery.DEFAULT_LIMIT;

  /**
   * Set by {@link #postConstruct(ConstructConfig)} when this view had to add the facets metadata
   * field to the configured projection itself; the field is then stripped from emitted documents.
   */
  private boolean removeFacets;

  /**
   * Default limit of the owning source, read in {@link #postInit(InitConfig)}. Used as the number
   * of facet values to harvest when the incoming query is unlimited.
   */
  private long sourceDefaultLimit;

  /**
   * Creates a new {@link FacetView} with the given box client and facet to use.
   *
   * @param boxClient the client for communicating with the remote box
   * @param facetName the facet group name to use as the id for documents
   * @deprecated use the builder instead
   */
  @Deprecated
  public FacetView(AbstractHttpBoxClient boxClient, String facetName) {
    super(boxClient);
    this.facetName = facetName;
    // A @Builder.Default initializer is applied by the builder (and the generated no-args
    // constructor), not by hand-written constructors, so assign the default explicitly here.
    this.documentLimit = BoxQuery.DEFAULT_LIMIT;
  }

  /**
   * See {@link BoxConfigurable#postConstruct(ConstructConfig)} and {@link
   * BoxConfigurable#postInit(InitConfig)}.
   *
   * @param constructConfig the construct config to use
   * @param initConfig the init config to use
   * @deprecated use the builder instead
   */
  @Deprecated
  public FacetView(ConstructConfig constructConfig, InitConfig initConfig) {
    postConstruct(constructConfig);
    postInit(initConfig);
  }

  /**
   * Configures this view from the {@code fields}, {@code facetName} and {@code limit} params.
   *
   * <p>If the configured {@code fields} projection does not already cover the facets metadata
   * field, it is appended to the params before delegating to the parent, and {@code removeFacets}
   * is set so the field can be dropped again from emitted documents.
   *
   * @param config the construct config holding the params
   * @throws NullPointerException if the {@code facetName} param is missing
   */
  @Override
  public void postConstruct(ConstructConfig config) {
    Set<String> fields = new LinkedHashSet<>();
    for (JsonNode field : config.getParams().path("fields")) {
      fields.add(field.asText());
    }

    // FacetView requires the metadata facets field, so we add it to the view if needed
    if (!JsonUtils.matchesProjection(BoxQuery.METADATA_FIELD_FACETS, fields)) {
      fields.add(BoxQuery.METADATA_FIELD_FACETS);
      config.getParams().remove("fields");
      ArrayNode fieldsNode = config.getParams().withArray("fields");
      for (String field : fields) {
        fieldsNode.add(field);
      }
      removeFacets = true;
    }

    super.postConstruct(config);

    ObjectNode params = config.getParams();
    this.facetName =
        Objects.requireNonNull(
            params.path("facetName").asText(null), "the facetName param is required");

    // a configured limit of -1 is translated to Long.MAX_VALUE
    long configuredLimit = params.path("limit").asLong(BoxQuery.DEFAULT_LIMIT);
    this.documentLimit = configuredLimit == -1 ? Long.MAX_VALUE : configuredLimit;
  }

  /**
   * Delegates to the parent and records the owning source's default limit.
   *
   * @param config the init config to use
   */
  @Override
  public void postInit(InitConfig config) {
    super.postInit(config);
    this.sourceDefaultLimit = config.getSource().getConfig().getDefaultLimit();
  }

  /**
   * Produces one document per facet value, each holding under {@code documents} the upstream
   * documents tagged with that facet value.
   *
   * <p>For a harvest query the facet values are discovered by harvesting upstream; otherwise the
   * ids of the query are used as the facet values. A facet value with no upstream documents
   * yields a document marked as deleted. Every returned document carries the same cursor, and the
   * result's next cursor is that cursor plus one whenever at least one facet value was found.
   *
   * @param query the incoming query; fields prefixed with {@code documents.} are forwarded
   *     upstream with the prefix removed
   * @return the facet documents for the query
   */
  @Override
  public QueryResult rawFind(BoxQuery query) {
    Set<String> facetValues = new LinkedHashSet<>();
    long upstreamCursor;

    if (query.isHarvestQuery()) {
      upstreamCursor = harvestFacetValues(query, facetValues);
    } else {
      upstreamCursor = query.getCursorOrDefault();
      query.getIds().forEach(facetValues::add);
    }

    QueryResult result = new QueryResult();

    if (facetValues.isEmpty()) {
      result.setNextCursor(upstreamCursor);
      return result;
    }

    // add one if facet values were found to push to next page
    result.setNextCursor(upstreamCursor + 1);

    // in order to make the request to the View, `documents.` needs to be removed from fields
    // starting with `documents.`
    final String documentsPrefix = "documents.";
    Set<String> upstreamFields =
        query.getFields().stream()
            .filter(f -> f.startsWith(documentsPrefix))
            .map(f -> f.substring(documentsPrefix.length()))
            .collect(Collectors.toSet());

    BoxQuery facetQuery = new BoxQuery().addFields(upstreamFields).setLimit(documentLimit);

    // need to make sure we get back the facets metadata field
    if (!JsonUtils.matchesProjection(BoxQuery.METADATA_FIELD_FACETS, upstreamFields)) {
      facetQuery.addField(BoxQuery.METADATA_FIELD_FACETS);
    }

    // process each facet one at a time
    Map<String, List<BoxDocument>> documentMap = new LinkedHashMap<>();
    for (String facetValue : facetValues) {
      // take latest if doc appears multiple times due to being updated while collecting
      Map<String, BoxDocument> latestById =
          super.rawFind(new BoxQuery(facetQuery).addFacet(facetName, facetValue)).stream()
              .collect(
                  Collectors.toMap(
                      BoxDocument::getId,
                      Function.identity(),
                      (older, newer) -> newer,
                      LinkedHashMap::new));
      documentMap.put(facetValue, latestById.values().stream().collect(Collectors.toList()));
    }

    // fields each nested document is rendered with; fall back to the view's own projection when
    // the query did not ask for any `documents.` fields
    Set<String> finalFields =
        new HashSet<>(upstreamFields.isEmpty() ? getFields() : upstreamFields);
    if (removeFacets) {
      finalFields.remove(BoxQuery.METADATA_FIELD_FACETS);
    }

    for (String facetValue : facetValues) {
      BoxDocument facetDocument = new BoxDocument(facetValue);
      ArrayNode documents = facetDocument.withDocument().withArray("documents");

      for (BoxDocument document : documentMap.getOrDefault(facetValue, Collections.emptyList())) {
        documents.add(document.toJson(finalFields));
      }

      // nothing upstream carries this facet value any more
      if (documents.size() == 0) {
        facetDocument.getDocument().removeAll();
        facetDocument.setAsDeleted();
      }

      // apply the caller's projection, but always keep the id and status metadata
      if (!query.getFields().isEmpty()) {
        ObjectNode json = facetDocument.toJson(query.getFields());
        json.with("@box")
            .put("id", facetDocument.getId())
            .put("status", facetDocument.getStatus().toString());
        facetDocument = BoxDocument.parse(json);
      }

      result.add(facetDocument);
    }

    for (BoxDocument doc : result) {
      doc.setCursor(upstreamCursor);
    }

    return result;
  }

  /**
   * Harvests the upstream source, starting at the query's cursor, until the requested number of
   * unique values of {@code facetName} has been collected or upstream returns an empty page.
   *
   * @param query the incoming harvest query
   * @param facetValues receives the discovered facet values in encounter order
   * @return the cursor of the last upstream document that contributed a facet value, or the
   *     query's cursor if none did
   */
  private long harvestFacetValues(BoxQuery query, Set<String> facetValues) {
    long upstreamCursor = query.getCursorOrDefault();

    // harvest the underlying source until we find query.getLimit() number of unique facet values
    BoxQuery facetValueQuery =
        new BoxQuery(query)
            .setLimit(getLimit())
            .clearFields()
            .addFields(BoxQuery.METADATA_FIELD_FACETS);

    long limit = query.getLimitOrDefault();
    limit = limit == BoxQuery.UNLIMITED ? sourceDefaultLimit : limit;

    boolean more = true;
    long nextCursor = query.getCursorOrDefault();

    while (more) {
      QueryResult response = super.rawFind(facetValueQuery.setCursor(nextCursor));
      nextCursor = response.getNextCursor();
      more = !response.isEmpty();

      for (BoxDocument document : response) {
        // must get all the facets from a document even if it pushes us over the limit in order
        // for the cursor to work out in subsequent calls
        if (facetValues.size() < limit) {
          for (Facet facet : document.getFacets()) {
            if (facet.getName().equals(facetName)) {
              facetValues.add(facet.getValue());
              upstreamCursor = document.getCursor().get();
            }
          }
        }
      }

      more = more && facetValues.size() < limit;
    }

    return upstreamCursor;
  }

  /**
   * Hand-declared shell of the builder; Lombok's {@code @SuperBuilder} fills in the rest.
   *
   * <p>The two private methods below accept and ignore their argument, presumably so that {@code
   * removeFacets} and {@code sourceDefaultLimit} are not settable through the public builder API
   * (both are assigned by {@code postConstruct}/{@code postInit}).
   */
  // needed for javadoc to succeed (https://stackoverflow.com/a/58809436/1530184)
  @java.lang.SuppressWarnings("all")
  @lombok.Generated
  public abstract static class FacetViewBuilder<
          C extends FacetView, B extends FacetViewBuilder<C, B>>
      extends View.ViewBuilder<C, B> {

    @SuppressWarnings("unused")
    private B removeFacets(boolean removeFacets) {
      return self();
    }

    @SuppressWarnings("unused")
    private B sourceDefaultLimit(long sourceDefaultLimit) {
      return self();
    }
  }
}