// org.apache.solr.update.DocumentBuilder (from the solr-core artifact)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrDocumentBase;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.schema.CopyField;
import org.apache.solr.schema.DenseVectorField;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
/**
 * Builds a Lucene {@link Document} from a {@link SolrInputDocument}: creates the index fields via
 * the schema's {@link FieldType}s, applies copyField directives, fills in default values,
 * validates required fields, and reorders stored fields for retrieval efficiency.
 */
public class DocumentBuilder {

  // accessible only for tests
  static int MIN_LENGTH_TO_MOVE_LAST =
      Integer.getInteger("solr.docBuilder.minLengthToMoveLast", 4 * 1024); // internal setting

  // Cap on how much of an offending value list is echoed back in error messages,
  // so we do not flood the logs with extremely long values.
  static int MAX_VALUES_AS_STRING_LENGTH = 256;

  /**
   * Add a field value to a given document.
   *
   * @param doc Document that the field needs to be added to
   * @param field The schema field object for the field
   * @param val The value for the field to be added
   * @param forInPlaceUpdate Whether the field is to be added for in-place update. If true, only
   *     numeric docValues based fields are added to the document. This can be true when
   *     constructing a Lucene document for writing an in-place update, and we don't need presence
   *     of non-updatable fields (non NDV) in such a document.
   */
  private static void addField(
      Document doc, SchemaField field, Object val, boolean forInPlaceUpdate) {
    if (val instanceof IndexableField) {
      if (forInPlaceUpdate) {
        assert val instanceof NumericDocValuesField
            : "Expected in-place update to be done on NDV fields only.";
      }
      doc.add((IndexableField) val);
      return;
    }
    for (IndexableField f : field.getType().createFields(field, val)) {
      if (f != null) { // null fields are not added
        // HACK: workaround for SOLR-9809
        // even though at this point in the code we know the field is single valued and DV only
        // TrieField.createFields() may still return (useless) IndexableField instances that are
        // not NumericDocValuesField instances.
        //
        // once SOLR-9809 is resolved, we should be able to replace this conditional with...
        // assert f instanceof NumericDocValuesField
        if (forInPlaceUpdate) {
          if (f instanceof NumericDocValuesField) {
            doc.add(f);
          }
        } else {
          doc.add(f);
        }
      }
    }
  }

  /**
   * Returns a {@code "[doc=...] "} prefix for error messages identifying the document by its
   * uniqueKey value, or the empty string if the schema has no uniqueKey field.
   */
  private static String getID(SolrInputDocument doc, IndexSchema schema) {
    String id = "";
    SchemaField sf = schema.getUniqueKeyField();
    if (sf != null) {
      id = "[doc=" + doc.getFieldValue(sf.getName()) + "] ";
    }
    return id;
  }

  /**
   * @see DocumentBuilder#toDocument(SolrInputDocument, IndexSchema, boolean, boolean)
   */
  public static Document toDocument(SolrInputDocument doc, IndexSchema schema) {
    return toDocument(doc, schema, false, true);
  }

  /**
   * Convert a SolrInputDocument to a lucene Document.
   *
   * <p>This function should go elsewhere. This builds the Document without an extra {@code Map}
   * checking for multiple values. For more discussion, see:
   * http://www.nabble.com/Re%3A-svn-commit%3A-r547493---in--lucene-solr-trunk%3A-.--src-java-org-apache-solr-common--src-java-org-apache-solr-schema--src-java-org-apache-solr-update--src-test-org-apache-solr-common--tf3931539.html
   *
   * <p>TODO: /!\ NOTE /!\ The semantics of this function are still in flux. Something somewhere
   * needs to be able to fill up a SolrDocument from a lucene document - this is one place that may
   * happen. It may also be moved to an independent function
   *
   * @since solr 1.3
   * @param doc SolrInputDocument from which the document has to be built
   * @param schema Schema instance
   * @param forInPlaceUpdate Whether the output document would be used for an in-place update or
   *     not. When this is true, default fields values and copy fields targets are not populated.
   * @param ignoreNestedDocs if nested child documents should be ignored. If false then an exception
   *     will be thrown.
   * @return Built Lucene document
   */
  public static Document toDocument(
      SolrInputDocument doc,
      IndexSchema schema,
      boolean forInPlaceUpdate,
      boolean ignoreNestedDocs) {
    if (!ignoreNestedDocs && doc.hasChildDocuments()) {
      throw unexpectedNestedDocException(schema, forInPlaceUpdate);
    }

    final SchemaField uniqueKeyField = schema.getUniqueKeyField();
    final String uniqueKeyFieldName = null == uniqueKeyField ? null : uniqueKeyField.getName();

    Document out = new Document();
    Set<String> usedFields = new HashSet<>();

    // Load fields from SolrDocument to Document
    for (SolrInputField field : doc) {

      // when in-place update, don't process the id & _root_; they won't change
      if (forInPlaceUpdate) {
        if (field.getName().equals(uniqueKeyFieldName)
            || field.getName().equals(IndexSchema.ROOT_FIELD_NAME)) {
          continue;
        }
      }

      if (field.getFirstValue() instanceof SolrDocumentBase) {
        if (ignoreNestedDocs) {
          continue;
        }
        throw unexpectedNestedDocException(schema, forInPlaceUpdate);
      }

      String name = field.getName();
      SchemaField sfield = schema.getFieldOrNull(name);
      boolean used = false;

      // Make sure it has the correct number
      if (sfield != null
          && !sfield.multiValued()
          && field.getValueCount() > 1
          && !(sfield.getType() instanceof DenseVectorField)) {
        // Ensure we do not flood the logs with extremely long values
        String fieldValue = field.getValue().toString();
        if (fieldValue.length() > MAX_VALUES_AS_STRING_LENGTH) {
          assert fieldValue.endsWith("]");
          fieldValue = fieldValue.substring(0, MAX_VALUES_AS_STRING_LENGTH - 4) + "...]";
        }
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            "ERROR: "
                + getID(doc, schema)
                + "multiple values encountered for non multiValued field "
                + sfield.getName()
                + ": "
                + fieldValue);
      }

      List<CopyField> copyFields = schema.getCopyFieldsList(name);
      if (copyFields.isEmpty()) copyFields = null;

      // load each field value
      boolean hasField = false;
      try {
        if (sfield != null && sfield.getType() instanceof DenseVectorField) {
          // Vector fields are handled as a single (list) value, never iterated element-wise.
          Object vectorValue = field.getValue();
          if (vectorValue != null) {
            hasField = true;
            used = addOriginalField(vectorValue, sfield, forInPlaceUpdate, out, usedFields);
            // Check if we should copy this field value to any other fields.
            // This could happen whether it is explicit or not.
            if (copyFields != null) {
              used |=
                  addCopyFields(
                      schema,
                      vectorValue,
                      sfield.getType(),
                      copyFields,
                      forInPlaceUpdate,
                      uniqueKeyFieldName,
                      out,
                      usedFields);
            }
          }
        } else {
          Iterator<Object> it = field.iterator();
          while (it.hasNext()) {
            Object v = it.next();
            if (v == null) {
              continue;
            }
            hasField = true;
            if (sfield != null) {
              used = addOriginalField(v, sfield, forInPlaceUpdate, out, usedFields);
            }
            // Check if we should copy this field value to any other fields.
            // This could happen whether it is explicit or not.
            if (copyFields != null) {
              used |=
                  addCopyFields(
                      schema,
                      v,
                      sfield.getType(),
                      copyFields,
                      forInPlaceUpdate,
                      uniqueKeyFieldName,
                      out,
                      usedFields);
            }
          }
        }
      } catch (SolrException ex) {
        // Preserve the original error code; just prepend document/field context.
        throw new SolrException(
            SolrException.ErrorCode.getErrorCode(ex.code()),
            "ERROR: "
                + getID(doc, schema)
                + "Error adding field '"
                + field.getName()
                + "'='"
                + field.getValue()
                + "' msg="
                + ex.getMessage(),
            ex);
      } catch (Exception ex) {
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            "ERROR: "
                + getID(doc, schema)
                + "Error adding field '"
                + field.getName()
                + "'='"
                + field.getValue()
                + "' msg="
                + ex.getMessage(),
            ex);
      }

      // make sure the field was used somehow...
      if (!used && hasField) {
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            "ERROR: " + getID(doc, schema) + "unknown field '" + name + "'");
      }
    }

    // Now validate required fields or add default values
    // fields with default values are defacto 'required'
    // Note: We don't need to add required fields if this document is to be used for
    // in-place updates, since this validation and population of required fields would've happened
    // during the full indexing initially.
    if (!forInPlaceUpdate) {
      for (SchemaField field : schema.getRequiredFields()) {
        if (out.getField(field.getName()) == null) {
          if (field.getDefaultValue() != null) {
            addField(out, field, field.getDefaultValue(), false);
          } else {
            String msg = getID(doc, schema) + "missing required field: " + field.getName();
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, msg);
          }
        }
      }
    }

    if (!forInPlaceUpdate) {
      moveLargestFieldLast(out);
    }

    return out;
  }

  /**
   * Adds {@code originalFieldValue} to {@code out} for its own schema field and records the field
   * name in {@code usedFields}. Always returns {@code true} (the value was consumed).
   */
  private static boolean addOriginalField(
      Object originalFieldValue,
      SchemaField sfield,
      boolean forInPlaceUpdate,
      Document out,
      Set<String> usedFields) {
    addField(out, sfield, originalFieldValue, forInPlaceUpdate);
    // record the field as having a value
    usedFields.add(sfield.getName());
    return true;
  }

  /**
   * Applies each copyField directive for a source value, validating multiValued and
   * DenseVectorField constraints on the destinations.
   *
   * @return true if the value was copied into at least one destination field
   */
  private static boolean addCopyFields(
      final IndexSchema schema,
      final Object originalFieldValue,
      FieldType originalFieldType,
      List<CopyField> copyFields,
      boolean forInPlaceUpdate,
      String uniqueKeyFieldName,
      Document out,
      Set<String> usedFields) {
    boolean used = false;
    for (CopyField cf : copyFields) {
      SchemaField destinationField = cf.getDestination();
      final boolean destHasValues = usedFields.contains(destinationField.getName());

      // Dense Vector Fields can only be copied to same field type
      if (originalFieldType instanceof DenseVectorField
          && !(destinationField.getType() instanceof DenseVectorField)) {
        // A wildcard copy source may legitimately match vector fields; skip rather than fail.
        if (schema.getCopySources(destinationField.getName()).contains("*")) {
          continue;
        } else {
          throw new SolrException(
              SolrException.ErrorCode.BAD_REQUEST,
              "The copy field destination must be a DenseVectorField: "
                  + destinationField.getName());
        }
      }

      // check if the copy field is a multivalued or not
      if (!destinationField.multiValued() && destHasValues) {
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            "Multiple values encountered for non multiValued copy field "
                + destinationField.getName()
                + ": "
                + originalFieldValue);
      }

      Object fieldValue = originalFieldValue;

      // Perhaps trim the length of a copy field
      if (originalFieldValue instanceof CharSequence && cf.getMaxChars() > 0) {
        fieldValue = cf.getLimitedValue(originalFieldValue.toString());
      }

      // TODO ban copyField populating uniqueKeyField; too problematic to support
      // The uniqueKey destination is never treated as an in-place update target.
      addField(
          out,
          destinationField,
          fieldValue,
          !destinationField.getName().equals(uniqueKeyFieldName) && forInPlaceUpdate);
      // record the field as having a value
      usedFields.add(destinationField.getName());
      used = true;
    }
    return used;
  }

  /**
   * Builds the appropriate exception for an unexpected nested child document, depending on
   * whether the schema can support child docs at all and whether this is an in-place update.
   */
  private static SolrException unexpectedNestedDocException(
      IndexSchema schema, boolean forInPlaceUpdate) {
    if (!schema.isUsableForChildDocs()) {
      return new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Unable to index docs with children: the schema must "
              + "include definitions for both a uniqueKey field and the '"
              + IndexSchema.ROOT_FIELD_NAME
              + "' field, using the exact same fieldType");
    } else if (forInPlaceUpdate) {
      return new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Unable to index docs with children: for an in-place update, just provide the doc by itself");
    } else {
      return new SolrException(
          SolrException.ErrorCode.SERVER_ERROR,
          "A document unexpectedly contained nested child documents");
    }
  }

  /**
   * Move the largest stored field last, because Lucene can avoid loading that one if it's not
   * needed.
   */
  private static void moveLargestFieldLast(Document doc) {
    String largestField = null;
    int largestFieldLen = -1;
    boolean largestIsLast = true;
    for (IndexableField field : doc) {
      if (!field.fieldType().stored()) {
        continue;
      }
      // Once any other stored field follows the current largest one, it is no longer last.
      if (largestIsLast && !field.name().equals(largestField)) {
        largestIsLast = false;
      }
      if (field.numericValue() != null) {
        // just ignore these as non-competitive (avoid toString'ing their number)
        continue;
      }
      String strVal = field.stringValue();
      if (strVal != null) {
        if (strVal.length() > largestFieldLen) {
          largestField = field.name();
          largestFieldLen = strVal.length();
          largestIsLast = true;
        }
      } else {
        BytesRef bytesRef = field.binaryValue();
        if (bytesRef != null && bytesRef.length > largestFieldLen) {
          largestField = field.name();
          largestFieldLen = bytesRef.length;
          largestIsLast = true;
        }
      }
    }
    if (!largestIsLast
        && largestField != null
        && largestFieldLen > MIN_LENGTH_TO_MOVE_LAST) { // only bother if the value isn't tiny
      List<IndexableField> addToEnd = new ArrayList<>();
      Iterator<IndexableField> iterator = doc.iterator();
      while (iterator.hasNext()) {
        IndexableField field = iterator.next();
        if (field.name().equals(largestField)) {
          addToEnd.add(field);
          iterator.remove(); // Document may not have "remove" but its iterator allows mutation
        }
      }
      for (IndexableField field : addToEnd) {
        doc.add(field);
      }
    }
  }
}