All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.craftercms.search.batch.impl.AbstractBinaryFileWithMetadataBatchIndexer Maven / Gradle / Ivy

/*
 * Copyright (C) 2007-2019 Crafter Software Corporation. All Rights Reserved.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.craftercms.search.batch.impl;

import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.craftercms.commons.file.stores.RemoteFile;
import org.craftercms.commons.file.stores.RemoteFileResolver;
import org.craftercms.commons.lang.RegexUtils;
import org.craftercms.search.batch.BatchIndexer;
import org.craftercms.search.batch.UpdateDetail;
import org.craftercms.search.batch.UpdateSet;
import org.craftercms.search.batch.UpdateStatus;
import org.craftercms.search.batch.exception.BatchIndexingException;
import org.craftercms.core.processors.ItemProcessor;
import org.craftercms.core.processors.impl.ItemProcessorPipeline;
import org.craftercms.core.service.Content;
import org.craftercms.core.service.ContentStoreService;
import org.craftercms.core.service.Context;
import org.craftercms.search.metadata.impl.AbstractMetadataCollector;
import org.dom4j.Document;
import org.dom4j.Element;
import org.dom4j.Node;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Required;
import org.springframework.core.io.Resource;
import org.springframework.mail.javamail.ConfigurableMimeFileTypeMap;
import org.springframework.util.LinkedMultiValueMap;
import org.springframework.util.MultiValueMap;

import javax.activation.FileTypeMap;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;

import static org.craftercms.search.batch.utils.IndexingUtils.isMimeTypeSupported;


/**
 * {@link BatchIndexer} that tries to match binary files with metadata files. A metadata file can reference several
 * binary files, and a binary file can be referenced by several metadata files. Also, this indexer supports the
 * concept of "child" binaries, where the parent is the metadata file and the binary file only exists in the index
 * as long as the metadata file exists and it references the binary.
 *
 * <p>Subclasses provide the index-specific operations: the two lookups between metadata and binary paths
 * ({@link #searchBinaryPathsFromMetadataPath} and {@link #searchMetadataPathFromBinaryPath}), {@link #doDelete}
 * and the {@code doUpdateContent} overloads.</p>
 *
 * <p>NOTE(review): the metadata maps ({@code MultiValueMap} / {@code Map}) are left as raw types because their
 * key/value types must agree with the map returned by the inherited {@code collectMetadata}, whose declaration is
 * not visible in this file. Confirm against {@link AbstractMetadataCollector} and parameterize.</p>
 *
 * @author avasquez
 */
public abstract class AbstractBinaryFileWithMetadataBatchIndexer
    extends AbstractMetadataCollector implements BatchIndexer {

    private static final Logger logger = LoggerFactory.getLogger(AbstractBinaryFileWithMetadataBatchIndexer.class);

    /** Default name of the field that {@link #extractMetadata} fills with the path of the metadata file. */
    public static final String DEFAULT_METADATA_PATH_FIELD_NAME = "metadataPath";
    /** Default value of {@link #localIdFieldName}. */
    public static final String DEFAULT_LOCAL_ID_FIELD_NAME = "localId";

    /** Mime types accepted by {@link #isBinary} and {@link #getBinaryFilePaths}. */
    protected List<String> supportedMimeTypes;
    /** Map used to resolve the mime type of a path. */
    protected FileTypeMap mimeTypesMap;
    /** Resolver for remotely stored binaries; when {@code null}, every binary is read from the content store. */
    protected RemoteFileResolver remoteFileResolver;
    /** Processor passed to the content store when the metadata descriptor is loaded. */
    protected ItemProcessor itemProcessor;
    /** Regexes that identify metadata file paths. */
    protected List<String> metadataPathPatterns;
    /** Regexes that identify binary file paths. */
    protected List<String> binaryPathPatterns;
    /** Regexes that identify binaries that are resolved through {@link #remoteFileResolver}. */
    protected List<String> remoteBinaryPathPatterns;
    /** Regexes that identify "child" binaries, which are deleted when no metadata file references them anymore. */
    protected List<String> childBinaryPathPatterns;
    /** XPath expressions evaluated against a metadata document to find the referenced binary paths. */
    protected List<String> referenceXPaths;
    /** Regexes a property name has to match to be extracted (no restriction when empty). */
    protected List<String> includePropertyPatterns;
    /** Regexes that exclude a property name from extraction. */
    protected List<String> excludePropertyPatterns;
    /**
     * Exact dotted element names whose whole subtree is skipped during metadata extraction.
     *
     * @deprecated exact-name exclusion; see {@link #excludePropertyPatterns} for pattern-based exclusion
     */
    @Deprecated
    protected List<String> excludeMetadataProperties;
    /** Name of the field that receives the metadata file path in the extracted metadata. */
    protected String metadataPathFieldName;
    /** Name of the local ID field; not read by this class itself. */
    protected String localIdFieldName;
    /** Binaries whose reported length is greater than this value are skipped with a warning. */
    protected long maxFileSize;

    /**
     * Creates the indexer with a {@link ConfigurableMimeFileTypeMap} and the default field names.
     */
    public AbstractBinaryFileWithMetadataBatchIndexer() {
        mimeTypesMap = new ConfigurableMimeFileTypeMap();
        metadataPathFieldName = DEFAULT_METADATA_PATH_FIELD_NAME;
        localIdFieldName = DEFAULT_LOCAL_ID_FIELD_NAME;
    }

    /**
     * @param supportedMimeTypes mime types of the binaries that should be indexed
     */
    public void setSupportedMimeTypes(List<String> supportedMimeTypes) {
        this.supportedMimeTypes = supportedMimeTypes;
    }

    /**
     * @param remoteFileResolver resolver used for paths matching the remote binary patterns
     */
    public void setRemoteFileResolver(RemoteFileResolver remoteFileResolver) {
        this.remoteFileResolver = remoteFileResolver;
    }

    /**
     * @param itemProcessor processor applied when a metadata item is loaded from the content store
     */
    public void setItemProcessor(ItemProcessor itemProcessor) {
        this.itemProcessor = itemProcessor;
    }

    /**
     * Convenience setter that wraps the given processors in an {@link ItemProcessorPipeline}, replacing any
     * processor set through {@link #setItemProcessor(ItemProcessor)}.
     *
     * @param itemProcessors processors to combine into a pipeline
     */
    public void setItemProcessors(List<ItemProcessor> itemProcessors) {
        this.itemProcessor = new ItemProcessorPipeline(itemProcessors);
    }

    /**
     * @param metadataPathPatterns regexes that identify metadata file paths
     */
    public void setMetadataPathPatterns(List<String> metadataPathPatterns) {
        this.metadataPathPatterns = metadataPathPatterns;
    }

    /**
     * @param binaryPathPatterns regexes that identify binary file paths
     */
    public void setBinaryPathPatterns(List<String> binaryPathPatterns) {
        this.binaryPathPatterns = binaryPathPatterns;
    }

    /**
     * @param remoteBinaryPathPatterns regexes that identify remotely stored binaries
     */
    public void setRemoteBinaryPathPatterns(List<String> remoteBinaryPathPatterns) {
        this.remoteBinaryPathPatterns = remoteBinaryPathPatterns;
    }

    /**
     * @param childBinaryPathPatterns regexes that identify child binaries
     */
    public void setChildBinaryPathPatterns(List<String> childBinaryPathPatterns) {
        this.childBinaryPathPatterns = childBinaryPathPatterns;
    }

    /**
     * @param referenceXPaths XPath expressions that select the binary references inside a metadata document
     */
    public void setReferenceXPaths(List<String> referenceXPaths) {
        this.referenceXPaths = referenceXPaths;
    }

    /**
     * @param includePropertyPatterns regexes a property name has to match to be extracted
     */
    public void setIncludePropertyPatterns(List<String> includePropertyPatterns) {
        this.includePropertyPatterns = includePropertyPatterns;
    }

    /**
     * @param excludePropertyPatterns regexes that exclude a property name from extraction
     */
    public void setExcludePropertyPatterns(List<String> excludePropertyPatterns) {
        this.excludePropertyPatterns = excludePropertyPatterns;
    }

    /**
     * @param excludeMetadataProperties exact dotted element names whose subtree is skipped
     * @deprecated exact-name exclusion; see {@link #setExcludePropertyPatterns(List)} for pattern-based exclusion
     */
    @Deprecated
    public void setExcludeMetadataProperties(List<String> excludeMetadataProperties) {
        this.excludeMetadataProperties = excludeMetadataProperties;
    }

    /**
     * @param metadataPathFieldName name of the field that receives the metadata file path
     */
    public void setMetadataPathFieldName(String metadataPathFieldName) {
        this.metadataPathFieldName = metadataPathFieldName;
    }

    /**
     * @param localIdFieldName name of the local ID field
     */
    public void setLocalIdFieldName(String localIdFieldName) {
        this.localIdFieldName = localIdFieldName;
    }

    /**
     * @param maxFileSize largest content length that is still indexed; larger binaries are skipped
     */
    @Required
    public void setMaxFileSize(final long maxFileSize) {
        this.maxFileSize = maxFileSize;
    }

    /**
     * Processes the update paths of the update set first and its delete paths afterwards.
     *
     * @param indexId             ID of the index to update
     * @param siteName            name of the site the paths belong to
     * @param contentStoreService content store used to read metadata and binary files
     * @param context             content store context
     * @param updateSet           paths to update and delete
     * @param updateStatus        status object handed over to the index operations
     * @throws BatchIndexingException declared by the {@link BatchIndexer} contract
     */
    @Override
    public void updateIndex(String indexId, String siteName,
                            ContentStoreService contentStoreService, Context context, UpdateSet updateSet,
                            UpdateStatus updateStatus) throws BatchIndexingException {
        doUpdates(indexId, siteName, contentStoreService, context, updateSet, updateStatus);
        doDeletes(indexId, siteName, contentStoreService, context, updateSet.getDeletePaths(), updateStatus);
    }

    /**
     * Handles the update paths of the update set in two passes: first every updated metadata file (re-indexing the
     * binaries it references and cleaning up the ones it no longer references), then every updated binary that
     * was not already handled through a metadata file.
     *
     * @param indexId             ID of the index to update
     * @param siteName            name of the site the paths belong to
     * @param contentStoreService content store used to read metadata and binary files
     * @param context             content store context
     * @param updateSet           source of the update paths and their update details
     * @param updateStatus        status object handed over to the index operations
     */
    protected void doUpdates(String indexId, String siteName, ContentStoreService contentStoreService, Context context,
                             UpdateSet updateSet, UpdateStatus updateStatus) {
        List<String> updatePaths = updateSet.getUpdatePaths();
        Set<String> metadataUpdatePaths = new LinkedHashSet<>();
        Set<String> binaryUpdatePaths = new LinkedHashSet<>();

        // Classify the paths; a path that is neither metadata nor a supported binary is ignored
        for (String path : updatePaths) {
            if (isMetadata(path)) {
                metadataUpdatePaths.add(path);
            } else if (isBinary(path)) {
                binaryUpdatePaths.add(path);
            }
        }

        for (String metadataPath : metadataUpdatePaths) {
            Collection<String> newBinaryPaths = Collections.emptyList();
            List<String> previousBinaryPaths = searchBinaryPathsFromMetadataPath(indexId, siteName, metadataPath);
            Document metadataDoc = loadMetadata(contentStoreService, context, siteName, metadataPath);

            if (metadataDoc != null) {
                newBinaryPaths = getBinaryFilePaths(metadataDoc);
            }

            // If there are previous binaries that are not associated to the metadata anymore, reindex them without
            // metadata or delete them if they're child binaries.
            updatePreviousBinaries(indexId, siteName, metadataPath, previousBinaryPaths, newBinaryPaths,
                binaryUpdatePaths, context, contentStoreService, updateSet.getUpdateDetail(metadataPath), updateStatus);

            // Index the new associated binaries
            if (CollectionUtils.isNotEmpty(newBinaryPaths)) {
                MultiValueMap metadata = extractMetadata(metadataPath, metadataDoc);

                for (String newBinaryPath : newBinaryPaths) {
                    // Handled here together with its metadata, so it must not be indexed again in the second pass
                    binaryUpdatePaths.remove(newBinaryPath);

                    mergeCollectedMetadata(metadata, metadataPath, contentStoreService, context);

                    updateBinaryWithMetadata(indexId, siteName, contentStoreService, context, newBinaryPath, metadata,
                        updateSet.getUpdateDetail(metadataPath), updateStatus);
                }
            }
        }

        for (String binaryPath : binaryUpdatePaths) {
            String metadataPath = searchMetadataPathFromBinaryPath(indexId, siteName, binaryPath);
            if (StringUtils.isNotEmpty(metadataPath)) {
                // If the binary file has an associated metadata, index the file with the metadata
                Document metadataDoc = loadMetadata(contentStoreService, context, siteName, metadataPath);
                if (metadataDoc != null) {
                    MultiValueMap metadata = extractMetadata(metadataPath, metadataDoc);

                    mergeCollectedMetadata(metadata, metadataPath, contentStoreService, context);

                    updateBinaryWithMetadata(indexId, siteName, contentStoreService, context, binaryPath,
                                             metadata, updateSet.getUpdateDetail(metadataPath), updateStatus);
                }
            } else {
                // If not, index by itself
                updateBinary(indexId, siteName, contentStoreService, context, binaryPath,
                    updateSet.getUpdateDetail(binaryPath), updateStatus);
            }
        }
    }

    /**
     * Overwrites the entries of {@code metadata} with the fields returned by the inherited
     * {@code collectMetadata} for the given metadata path.
     */
    @SuppressWarnings({"unchecked", "rawtypes"}) // map types are raw throughout this class, see the class comment
    private void mergeCollectedMetadata(MultiValueMap metadata, String metadataPath,
                                        ContentStoreService contentStoreService, Context context) {
        Map additionalFields = collectMetadata(metadataPath, contentStoreService, context);
        metadata.setAll(additionalFields);
    }

    /**
     * Cleans up the binaries that were associated with a metadata file in the index but are not referenced by it
     * anymore: child binaries are deleted, any other binary is re-indexed without metadata. Every binary handled
     * here is also removed from {@code binaryUpdatePaths} so it is not processed a second time.
     *
     * @param indexId             ID of the index to update
     * @param siteName            name of the site the paths belong to
     * @param metadataPath        path of the metadata file being updated
     * @param previousBinaryPaths binaries currently associated with the metadata file in the index; may be
     *                            {@code null} or empty
     * @param newBinaryPaths      binaries referenced by the current version of the metadata file; may be
     *                            {@code null} or empty
     * @param binaryUpdatePaths   pending stand-alone binary updates; modified by this method
     * @param context             content store context
     * @param contentStoreService content store used to read the binaries
     * @param updateDetail        update detail of the metadata file
     * @param updateStatus        status object handed over to the index operations
     */
    protected void updatePreviousBinaries(String indexId, String siteName, String metadataPath,
                                          List<String> previousBinaryPaths, Collection<String> newBinaryPaths,
                                          Set<String> binaryUpdatePaths, Context context,
                                          ContentStoreService contentStoreService,
                                          UpdateDetail updateDetail, UpdateStatus updateStatus) {
        if (CollectionUtils.isEmpty(previousBinaryPaths)) {
            return;
        }

        for (String previousBinaryPath : previousBinaryPaths) {
            boolean stillReferenced = CollectionUtils.isNotEmpty(newBinaryPaths) &&
                                      newBinaryPaths.contains(previousBinaryPath);
            if (stillReferenced) {
                continue;
            }

            binaryUpdatePaths.remove(previousBinaryPath);

            if (isChildBinary(previousBinaryPath)) {
                logger.debug(
                        "Reference of child binary {} removed from  parent {}. Deleting binary from index...",
                        previousBinaryPath, metadataPath);

                doDelete(indexId, siteName, previousBinaryPath, updateStatus);
            } else {
                logger.debug("Reference of binary {} removed from {}. Reindexing without metadata...",
                             previousBinaryPath, metadataPath);

                updateBinary(indexId, siteName, contentStoreService, context,
                    previousBinaryPath, updateDetail, updateStatus);
            }
        }
    }

    /**
     * Removes a binary from the index.
     *
     * @param indexId            ID of the index to update
     * @param siteName           name of the site the binary belongs to
     * @param previousBinaryPath path of the binary to delete
     * @param updateStatus       status object of the current batch
     */
    protected abstract void doDelete(final String indexId, final String siteName, final String previousBinaryPath,
                                     final UpdateStatus updateStatus);

    /**
     * Handles the deleted paths. For a deleted metadata file, its child binaries are deleted and its other
     * binaries are re-indexed without metadata. A deleted binary is simply removed from the index.
     *
     * @param indexId             ID of the index to update
     * @param siteName            name of the site the paths belong to
     * @param contentStoreService content store used to read the binaries that are re-indexed
     * @param context             content store context
     * @param deletePaths         deleted paths; nothing is done when {@code null} or empty
     * @param updateStatus        status object handed over to the index operations
     */
    protected void doDeletes(String indexId, String siteName, ContentStoreService contentStoreService, Context context,
                             List<String> deletePaths, UpdateStatus updateStatus) {
        if (CollectionUtils.isEmpty(deletePaths)) {
            return;
        }

        for (String path : deletePaths) {
            if (isMetadata(path)) {
                List<String> binaryPaths = searchBinaryPathsFromMetadataPath(indexId, siteName, path);
                // Same tolerance for a missing search result as in updatePreviousBinaries
                if (CollectionUtils.isEmpty(binaryPaths)) {
                    continue;
                }

                for (String binaryPath : binaryPaths) {
                    if (isChildBinary(binaryPath)) {
                        logger.debug("Parent of binary {} deleted. Deleting child binary too", binaryPath);

                        // If the binary is a child binary, when the metadata file is deleted, then delete it
                        doDelete(indexId, siteName, binaryPath, updateStatus);
                    } else {
                        logger.debug("Metadata with reference of binary {} deleted. Reindexing without metadata...",
                                     binaryPath);

                        // Else, update binary without metadata (there is no update detail for a delete)
                        updateBinary(indexId, siteName, contentStoreService, context, binaryPath, null, updateStatus);
                    }
                }
            } else if (isBinary(path)) {
                doDelete(indexId, siteName, path, updateStatus);
            }
        }
    }

    /**
     * @param path path to check
     * @return {@code true} if the path matches any of the metadata path patterns
     */
    protected boolean isMetadata(String path) {
        return RegexUtils.matchesAny(path, metadataPathPatterns);
    }

    /**
     * @param path path to check
     * @return {@code true} if the path matches any of the binary path patterns and its mime type is supported
     */
    protected boolean isBinary(String path) {
        return RegexUtils.matchesAny(path, binaryPathPatterns) &&
               isMimeTypeSupported(mimeTypesMap, supportedMimeTypes, path);
    }

    /**
     * @param path path to check
     * @return {@code true} if the path matches any of the remote binary path patterns
     */
    protected boolean isRemoteBinary(String path) {
        return RegexUtils.matchesAny(path, remoteBinaryPathPatterns);
    }

    /**
     * @param path path to check
     * @return {@code true} if the path matches any of the child binary path patterns
     */
    protected boolean isChildBinary(String path) {
        return RegexUtils.matchesAny(path, childBinaryPathPatterns);
    }


    /**
     * Looks up the binaries that are currently associated with a metadata file in the index.
     *
     * @param indexId      ID of the index to search
     * @param siteName     name of the site the metadata file belongs to
     * @param metadataPath path of the metadata file
     * @return the paths of the associated binaries; callers in this class tolerate {@code null} or empty
     */
    protected abstract List<String> searchBinaryPathsFromMetadataPath(String indexId, String siteName,
                                                                      String metadataPath);

    /**
     * Looks up the metadata file that is currently associated with a binary in the index.
     *
     * @param indexId    ID of the index to search
     * @param siteName   name of the site the binary belongs to
     * @param binaryPath path of the binary
     * @return the path of the associated metadata file, or {@code null}/empty if the binary has none
     */
    protected abstract String searchMetadataPathFromBinaryPath(String indexId, String siteName, String binaryPath);

    /**
     * Loads the descriptor DOM of a metadata file. Failures are logged and never propagated.
     *
     * @param contentStoreService content store to read from
     * @param context             content store context
     * @param siteName            site name, only used for log messages
     * @param metadataPath        path of the metadata file
     * @return the descriptor document, or {@code null} if the item has no descriptor DOM or could not be retrieved
     */
    protected Document loadMetadata(ContentStoreService contentStoreService, Context context, String siteName,
                                    String metadataPath) {
        try {
            Document metadataDoc = contentStoreService.getItem(context, null, metadataPath,
                                                               itemProcessor).getDescriptorDom();
            if (metadataDoc != null) {
                return metadataDoc;
            } else {
                logger.error("File {}:{} is not a metadata XML descriptor", siteName, metadataPath);
            }
        } catch (Exception e) {
            // Best effort: a metadata file that can't be read must not abort the whole batch
            logger.error("Error retrieving metadata file @ {}:{}", siteName, metadataPath, e);
        }

        return null;
    }

    /**
     * Evaluates the reference XPaths against the metadata document and collects the text of every selected node
     * that is not blank and whose mime type is supported.
     *
     * @param document metadata document to inspect
     * @return the referenced binary paths without duplicates, in the order they were found; empty (never
     *         {@code null}) when no reference XPaths are configured or nothing matches
     */
    @SuppressWarnings("unchecked") // selectNodes may be declared with a raw List, depending on the dom4j version
    protected Collection<String> getBinaryFilePaths(Document document) {
        Set<String> binaryPaths = new LinkedHashSet<>();

        if (CollectionUtils.isNotEmpty(referenceXPaths)) {
            for (String refXpath : referenceXPaths) {
                List<Node> references = document.selectNodes(refXpath);
                if (CollectionUtils.isNotEmpty(references)) {
                    for (Node reference : references) {
                        String referenceValue = reference.getText();
                        if (StringUtils.isNotBlank(referenceValue) &&
                            isMimeTypeSupported(mimeTypesMap, supportedMimeTypes, referenceValue)) {
                            binaryPaths.add(referenceValue);
                        }
                    }
                }
            }
        }

        return binaryPaths;
    }

    /**
     * Indexes a binary together with the metadata of the file that references it. Remote binaries are resolved
     * through the remote file resolver, every other binary is read from the content store. When the binary is not
     * found in the content store, an {@link EmptyContent} is indexed so the metadata still reaches the index.
     * Binaries larger than the maximum file size are skipped. Failures are logged and never propagated.
     *
     * @param indexId             ID of the index to update
     * @param siteName            name of the site the binary belongs to
     * @param contentStoreService content store used to read local binaries
     * @param context             content store context
     * @param binaryPath          path of the binary
     * @param metadata            metadata fields to index with the binary
     * @param updateDetail        update detail handed over to the index operation
     * @param updateStatus        status object handed over to the index operation
     */
    protected void updateBinaryWithMetadata(String indexId, String siteName,
                                            ContentStoreService contentStoreService, Context context,
                                            String binaryPath, MultiValueMap metadata,
                                            UpdateDetail updateDetail, UpdateStatus updateStatus) {
        try {
            // Check if the binary file is stored remotely
            if (remoteFileResolver != null && isRemoteBinary(binaryPath)) {
                logger.debug("Indexing remote file {}", binaryPath);

                RemoteFile remoteFile = remoteFileResolver.resolve(binaryPath);

                if (remoteFile.getContentLength() > maxFileSize) {
                    logger.warn("Skipping large binary file @ {}", binaryPath);
                } else {
                    doUpdateContent(indexId, siteName, binaryPath, remoteFile.toResource(), metadata, updateDetail,
                                    updateStatus);
                }
            } else {
                Content binaryContent = contentStoreService.findContent(context, binaryPath);
                if (binaryContent == null) {
                    logger.debug("No binary file found @ {}:{}. Empty content will be used for the update", siteName,
                                 binaryPath);

                    binaryContent = new EmptyContent();
                }

                if (binaryContent.getLength() > maxFileSize) {
                    logger.warn("Skipping large binary file @ {}", binaryPath);
                } else {
                    doUpdateContent(indexId, siteName, binaryPath, binaryContent, metadata, updateDetail, updateStatus);
                }
            }
        } catch (Exception e) {
            // Best effort: a single failing binary must not abort the whole batch
            logger.error("Error when trying to send index update with metadata for binary file {}:{}", siteName,
                         binaryPath, e);
        }
    }

    /**
     * Sends a remote binary and its metadata to the index.
     *
     * @param indexId      ID of the index to update
     * @param siteName     name of the site the binary belongs to
     * @param binaryPath   path of the binary
     * @param resource     resource of the resolved remote file
     * @param metadata     metadata fields to index with the binary
     * @param updateDetail update detail of the change
     * @param updateStatus status object of the current batch
     */
    protected abstract void doUpdateContent(final String indexId, final String siteName, final String binaryPath,
                                            final Resource resource, final MultiValueMap metadata,
                                            final UpdateDetail updateDetail, final UpdateStatus updateStatus);

    /**
     * Sends a content store binary (or an {@link EmptyContent}) and its metadata to the index.
     *
     * @param indexId      ID of the index to update
     * @param siteName     name of the site the binary belongs to
     * @param binaryPath   path of the binary
     * @param content      content of the binary
     * @param metadata     metadata fields to index with the binary
     * @param updateDetail update detail of the change
     * @param updateStatus status object of the current batch
     */
    protected abstract void doUpdateContent(final String indexId, final String siteName, final String binaryPath,
                                            final Content content, final MultiValueMap metadata,
                                            final UpdateDetail updateDetail, final UpdateStatus updateStatus);

    /**
     * Indexes a binary that has no associated metadata file. Remote binaries are resolved through the remote file
     * resolver, every other binary is read from the content store. Unlike
     * {@link #updateBinaryWithMetadata}, a binary that is missing or empty in the content store is skipped.
     * Binaries larger than the maximum file size are skipped too. Failures are logged and never propagated.
     *
     * @param indexId             ID of the index to update
     * @param siteName            name of the site the binary belongs to
     * @param contentStoreService content store used to read local binaries
     * @param context             content store context
     * @param binaryPath          path of the binary
     * @param updateDetail        update detail handed over to the index operation; may be {@code null}
     * @param updateStatus        status object handed over to the index operation
     */
    protected void updateBinary(String indexId, String siteName, ContentStoreService contentStoreService,
                                Context context, String binaryPath, UpdateDetail updateDetail,
                                UpdateStatus updateStatus) {
        try {
            // Check if the binary file is stored remotely
            if (remoteFileResolver != null && isRemoteBinary(binaryPath)) {
                logger.info("Indexing remote file {}", binaryPath);

                RemoteFile remoteFile = remoteFileResolver.resolve(binaryPath);

                if (remoteFile.getContentLength() > maxFileSize) {
                    logger.warn("Skipping large binary file @ {}", binaryPath);
                } else {
                    doUpdateContent(indexId, siteName, binaryPath, remoteFile.toResource(), updateDetail, updateStatus);
                }
            } else {
                Content binaryContent = contentStoreService.findContent(context, binaryPath);
                if (binaryContent != null && binaryContent.getLength() > 0) {
                    if (binaryContent.getLength() > maxFileSize) {
                        logger.warn("Skipping large binary file @ {}", binaryPath);
                    } else {
                        // Only the fields collected for the binary's own path are sent along
                        Map metadata = collectMetadata(binaryPath, contentStoreService, context);
                        doUpdateContent(indexId, siteName, binaryPath, binaryContent, updateDetail, updateStatus,
                                        metadata);
                    }
                } else {
                    logger.debug("No binary file found @ {}:{}. Skipping update", siteName, binaryPath);
                }
            }
        } catch (Exception e) {
            // Best effort: a single failing binary must not abort the whole batch
            logger.error("Error when trying to send index update for binary file {}:{}", siteName, binaryPath, e);
        }
    }

    /**
     * Sends a remote binary without metadata to the index.
     *
     * @param indexId      ID of the index to update
     * @param siteName     name of the site the binary belongs to
     * @param binaryPath   path of the binary
     * @param toResource   resource of the resolved remote file
     * @param updateDetail update detail of the change; may be {@code null}
     * @param updateStatus status object of the current batch
     */
    protected abstract void doUpdateContent(final String indexId, final String siteName, final String binaryPath,
                                            final Resource toResource, final UpdateDetail updateDetail,
                                            final UpdateStatus updateStatus);

    /**
     * Sends a content store binary that has no metadata file to the index.
     *
     * @param indexId      ID of the index to update
     * @param siteName     name of the site the binary belongs to
     * @param binaryPath   path of the binary
     * @param content      content of the binary
     * @param updateDetail update detail of the change; may be {@code null}
     * @param updateStatus status object of the current batch
     * @param metadata     fields collected for the binary's own path
     */
    protected abstract void doUpdateContent(final String indexId, final String siteName, final String binaryPath,
                                            final Content content, final UpdateDetail updateDetail,
                                            final UpdateStatus updateStatus, final Map metadata);

    /**
     * Flattens the metadata document into a multi-value map and adds the path of the metadata file under the
     * configured metadata path field name.
     *
     * @param path     path of the metadata file
     * @param document metadata document
     * @return the extracted fields
     */
    @SuppressWarnings({"unchecked", "rawtypes"}) // map types are raw throughout this class, see the class comment
    protected MultiValueMap extractMetadata(String path, Document document) {
        MultiValueMap metadata = new LinkedMultiValueMap<>();
        Element rootElem = document.getRootElement();

        extractMetadataFromChildren(rootElem, StringUtils.EMPTY, metadata);

        logger.debug("Extracted metadata: {}", metadata);

        // Add extra metadata ID field
        metadata.set(metadataPathFieldName, path);

        return metadata;
    }

    /**
     * Recursively walks the child nodes of an element. Child elements extend the key with their name, separated
     * by a dot, unless the resulting key is listed in the deprecated excluded metadata properties, in which case
     * the whole subtree is skipped. The non-blank text of any other node is added, trimmed, under the key of the
     * element being visited if {@link #shouldIncludeProperty(String)} accepts that key.
     *
     * @param element  element whose child nodes are visited
     * @param key      dotted key of {@code element}, empty for the root element
     * @param metadata map that receives the extracted values
     */
    @SuppressWarnings({"unchecked", "rawtypes"}) // raw metadata map; nodeIterator may be declared raw as well
    protected void extractMetadataFromChildren(Element element, String key, MultiValueMap metadata) {
        for (Iterator<Node> iter = element.nodeIterator(); iter.hasNext(); ) {
            Node node = iter.next();

            if (node instanceof Element) {
                StringBuilder childKey = new StringBuilder(key);

                if (childKey.length() > 0) {
                    childKey.append(".");
                }

                childKey.append(node.getName());

                if (CollectionUtils.isEmpty(excludeMetadataProperties) ||
                    !excludeMetadataProperties.contains(childKey.toString())) {
                    extractMetadataFromChildren((Element) node, childKey.toString(), metadata);
                }
            } else {
                String value = node.getText();
                if (StringUtils.isNotBlank(value) && shouldIncludeProperty(key)) {
                    logger.debug("Adding value [{}] for property [{}]", value, key);

                    metadata.add(key, StringUtils.trim(value));
                }
            }
        }
    }

    /**
     * @param name dotted property name
     * @return {@code true} if the name matches the include patterns (or none are configured) and does not match
     *         any of the exclude patterns
     */
    protected boolean shouldIncludeProperty(String name) {
        return (CollectionUtils.isEmpty(includePropertyPatterns) ||
                RegexUtils.matchesAny(name, includePropertyPatterns)) &&
               (CollectionUtils.isEmpty(excludePropertyPatterns) ||
                !RegexUtils.matchesAny(name, excludePropertyPatterns));
    }

    /**
     * {@link Content} without any data, used to index the metadata of a binary that can't be found in the
     * content store.
     */
    public static class EmptyContent implements Content {

        /**
         * @return the current time, since there is no file to take a modification date from
         */
        @Override
        public long getLastModified() {
            return System.currentTimeMillis();
        }

        /**
         * @return always 0
         */
        @Override
        public long getLength() {
            return 0;
        }

        /**
         * @return a new stream over zero bytes
         */
        @Override
        public InputStream getInputStream() throws IOException {
            return new ByteArrayInputStream(new byte[0]);
        }

    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy