All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.update.DocumentBuilder Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.update;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrDocumentBase;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.schema.CopyField;
import org.apache.solr.schema.DenseVectorField;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;

/** Builds a Lucene {@link Document} from a {@link SolrInputDocument}. */
public class DocumentBuilder {

  // accessible only for tests
  static int MIN_LENGTH_TO_MOVE_LAST =
      Integer.getInteger("solr.docBuilder.minLengthToMoveLast", 4 * 1024); // internal setting
  static int MAX_VALUES_AS_STRING_LENGTH = 256;

  /**
   * Add a field value to a given document.
   *
   * @param doc Document that the field needs to be added to
   * @param field The schema field object for the field
   * @param val The value for the field to be added
   * @param forInPlaceUpdate Whether the field is to be added for in-place update. If true, only
   *     numeric docValues based fields are added to the document. This can be true when
   *     constructing a Lucene document for writing an in-place update, and we don't need presence
   *     of non-updatable fields (non NDV) in such a document.
   */
  private static void addField(
      Document doc, SchemaField field, Object val, boolean forInPlaceUpdate) {
    if (val instanceof IndexableField) {
      if (forInPlaceUpdate) {
        assert val instanceof NumericDocValuesField
            : "Expected in-place update to be done on" + " NDV fields only.";
      }
      doc.add((IndexableField) val);
      return;
    }
    for (IndexableField f : field.getType().createFields(field, val)) {
      if (f != null) { // null fields are not added
        // HACK: workaround for SOLR-9809
        // even though at this point in the code we know the field is single valued and DV only
        // TrieField.createFields() may still return (usless) IndexableField instances that are not
        // NumericDocValuesField instances.
        //
        // once SOLR-9809 is resolved, we should be able to replace this conditional with...
        //    assert f instanceof NumericDocValuesField
        if (forInPlaceUpdate) {
          if (f instanceof NumericDocValuesField) {
            doc.add(f);
          }
        } else {
          doc.add(f);
        }
      }
    }
  }

  private static String getID(SolrInputDocument doc, IndexSchema schema) {
    String id = "";
    SchemaField sf = schema.getUniqueKeyField();
    if (sf != null) {
      id = "[doc=" + doc.getFieldValue(sf.getName()) + "] ";
    }
    return id;
  }

  /**
   * @see DocumentBuilder#toDocument(SolrInputDocument, IndexSchema, boolean, boolean)
   */
  public static Document toDocument(SolrInputDocument doc, IndexSchema schema) {
    return toDocument(doc, schema, false, true);
  }

  /**
   * Convert a SolrInputDocument to a lucene Document.
   *
   * 

This function should go elsewhere. This builds the Document without an extra Map<> * checking for multiple values. For more discussion, see: * http://www.nabble.com/Re%3A-svn-commit%3A-r547493---in--lucene-solr-trunk%3A-.--src-java-org-apache-solr-common--src-java-org-apache-solr-schema--src-java-org-apache-solr-update--src-test-org-apache-solr-common--tf3931539.html * *

TODO: /!\ NOTE /!\ This semantics of this function are still in flux. Something somewhere * needs to be able to fill up a SolrDocument from a lucene document - this is one place that may * happen. It may also be moved to an independent function * * @since solr 1.3 * @param doc SolrInputDocument from which the document has to be built * @param schema Schema instance * @param forInPlaceUpdate Whether the output document would be used for an in-place update or * not. When this is true, default fields values and copy fields targets are not populated. * @param ignoreNestedDocs if nested child documents should be ignored. If false then an exception * will be thrown. * @return Built Lucene document */ public static Document toDocument( SolrInputDocument doc, IndexSchema schema, boolean forInPlaceUpdate, boolean ignoreNestedDocs) { if (!ignoreNestedDocs && doc.hasChildDocuments()) { throw unexpectedNestedDocException(schema, forInPlaceUpdate); } final SchemaField uniqueKeyField = schema.getUniqueKeyField(); final String uniqueKeyFieldName = null == uniqueKeyField ? null : uniqueKeyField.getName(); Document out = new Document(); Set usedFields = new HashSet<>(); // Load fields from SolrDocument to Document for (SolrInputField field : doc) { // when in-place update, don't process the id & _root_; they won't change if (forInPlaceUpdate) { if (field.getName().equals(uniqueKeyFieldName) || field.getName().equals(IndexSchema.ROOT_FIELD_NAME)) { continue; } } if (field.getFirstValue() instanceof SolrDocumentBase) { if (ignoreNestedDocs) { continue; } throw unexpectedNestedDocException(schema, forInPlaceUpdate); } String name = field.getName(); SchemaField sfield = schema.getFieldOrNull(name); boolean used = false; // Make sure it has the correct number if (sfield != null && !sfield.multiValued() && field.getValueCount() > 1 && !(sfield.getType() instanceof DenseVectorField)) { // Ensure we do not flood the logs with extremely long values String fieldValue = field.getValue().toString(); if (fieldValue.length() > MAX_VALUES_AS_STRING_LENGTH) { assert fieldValue.endsWith("]"); fieldValue = fieldValue.substring(0, MAX_VALUES_AS_STRING_LENGTH - 4) + "...]"; } throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "ERROR: " + getID(doc, schema) + "multiple values encountered for non multiValued field " + sfield.getName() + ": " + fieldValue); } List copyFields = schema.getCopyFieldsList(name); if (copyFields.size() == 0) copyFields = null; // load each field value boolean hasField = false; try { if (sfield != null && sfield.getType() instanceof DenseVectorField) { Object vectorValue = field.getValue(); if (vectorValue != null) { hasField = true; used = addOriginalField(vectorValue, sfield, forInPlaceUpdate, out, usedFields); // Check if we should copy this field value to any other fields. // This could happen whether it is explicit or not. if (copyFields != null) { used |= addCopyFields( schema, vectorValue, sfield.getType(), copyFields, forInPlaceUpdate, uniqueKeyFieldName, out, usedFields); } } } else { Iterator it = field.iterator(); while (it.hasNext()) { Object v = it.next(); if (v == null) { continue; } hasField = true; if (sfield != null) { used = addOriginalField(v, sfield, forInPlaceUpdate, out, usedFields); } // Check if we should copy this field value to any other fields. // This could happen whether it is explicit or not. if (copyFields != null) { used |= addCopyFields( schema, v, sfield.getType(), copyFields, forInPlaceUpdate, uniqueKeyFieldName, out, usedFields); } } } } catch (SolrException ex) { throw new SolrException( SolrException.ErrorCode.getErrorCode(ex.code()), "ERROR: " + getID(doc, schema) + "Error adding field '" + field.getName() + "'='" + field.getValue() + "' msg=" + ex.getMessage(), ex); } catch (Exception ex) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "ERROR: " + getID(doc, schema) + "Error adding field '" + field.getName() + "'='" + field.getValue() + "' msg=" + ex.getMessage(), ex); } // make sure the field was used somehow... if (!used && hasField) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "ERROR: " + getID(doc, schema) + "unknown field '" + name + "'"); } } // Now validate required fields or add default values // fields with default values are defacto 'required' // Note: We don't need to add required fields if this document is to be used for // in-place updates, since this validation and population of required fields would've happened // during the full indexing initially. if (!forInPlaceUpdate) { for (SchemaField field : schema.getRequiredFields()) { if (out.getField(field.getName()) == null) { if (field.getDefaultValue() != null) { addField(out, field, field.getDefaultValue(), false); } else { String msg = getID(doc, schema) + "missing required field: " + field.getName(); throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, msg); } } } } if (!forInPlaceUpdate) { moveLargestFieldLast(out); } return out; } private static boolean addOriginalField( Object originalFieldValue, SchemaField sfield, boolean forInPlaceUpdate, Document out, Set usedFields) { addField(out, sfield, originalFieldValue, forInPlaceUpdate); // record the field as having a value usedFields.add(sfield.getName()); return true; } private static boolean addCopyFields( final IndexSchema schema, final Object originalFieldValue, FieldType originalFieldType, List copyFields, boolean forInPlaceUpdate, String uniqueKeyFieldName, Document out, Set usedFields) { boolean used = false; for (CopyField cf : copyFields) { SchemaField destinationField = cf.getDestination(); final boolean destHasValues = usedFields.contains(destinationField.getName()); // Dense Vector Fields can only be copied to same field type if (originalFieldType instanceof DenseVectorField && !(destinationField.getType() instanceof DenseVectorField)) { if (schema.getCopySources(destinationField.getName()).contains("*")) { continue; } else { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "The copy field destination must be a DenseVectorField: " + destinationField.getName()); } } // check if the copy field is a multivalued or not if (!destinationField.multiValued() && destHasValues) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Multiple values encountered for non multiValued copy field " + destinationField.getName() + ": " + originalFieldValue); } Object fieldValue = originalFieldValue; // Perhaps trim the length of a copy field if (originalFieldValue instanceof CharSequence && cf.getMaxChars() > 0) { fieldValue = cf.getLimitedValue(originalFieldValue.toString()); } // TODO ban copyField populating uniqueKeyField; too problematic to support addField( out, destinationField, fieldValue, destinationField.getName().equals(uniqueKeyFieldName) ? false : forInPlaceUpdate); // record the field as having a originalFieldValue usedFields.add(destinationField.getName()); used = true; } return used; } private static SolrException unexpectedNestedDocException( IndexSchema schema, boolean forInPlaceUpdate) { if (!schema.isUsableForChildDocs()) { return new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unable to index docs with children: the schema must " + "include definitions for both a uniqueKey field and the '" + IndexSchema.ROOT_FIELD_NAME + "' field, using the exact same fieldType"); } else if (forInPlaceUpdate) { return new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unable to index docs with children: for an in-place update, just provide the doc by itself"); } else { return new SolrException( SolrException.ErrorCode.SERVER_ERROR, "A document unexpectedly contained nested child documents"); } } /** * Move the largest stored field last, because Lucene can avoid loading that one if it's not * needed. */ private static void moveLargestFieldLast(Document doc) { String largestField = null; int largestFieldLen = -1; boolean largestIsLast = true; for (IndexableField field : doc) { if (!field.fieldType().stored()) { continue; } if (largestIsLast && !field.name().equals(largestField)) { largestIsLast = false; } if (field.numericValue() != null) { // just ignore these as non-competitive (avoid toString'ing their number) continue; } String strVal = field.stringValue(); if (strVal != null) { if (strVal.length() > largestFieldLen) { largestField = field.name(); largestFieldLen = strVal.length(); largestIsLast = true; } } else { BytesRef bytesRef = field.binaryValue(); if (bytesRef != null && bytesRef.length > largestFieldLen) { largestField = field.name(); largestFieldLen = bytesRef.length; largestIsLast = true; } } } if (!largestIsLast && largestField != null && largestFieldLen > MIN_LENGTH_TO_MOVE_LAST) { // only bother if the value isn't tiny List addToEnd = new ArrayList<>(); Iterator iterator = doc.iterator(); while (iterator.hasNext()) { IndexableField field = iterator.next(); if (field.name().equals(largestField)) { addToEnd.add(field); iterator.remove(); // Document may not have "remove" but it's iterator allows mutation } } for (IndexableField field : addToEnd) { doc.add(field); } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy