org.opencms.search.documents.A_CmsVfsDocument Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opencms-test Show documentation
OpenCms is an enterprise-ready, easy to use website content management system based on Java and XML technology. Offering a complete set of features, OpenCms helps content managers worldwide to create and maintain beautiful websites fast and efficiently.
There is a newer version: 18.0
Show newest version
/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software GmbH & Co. KG, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package org.opencms.search.documents;

import org.opencms.file.CmsFile;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsResource;
import org.opencms.file.types.I_CmsResourceType;
import org.opencms.main.CmsException;
import org.opencms.main.CmsLog;
import org.opencms.main.OpenCms;
import org.opencms.search.CmsSearchIndex;
import org.opencms.search.I_CmsSearchDocument;
import org.opencms.search.extractors.CmsExtractionResult;
import org.opencms.search.extractors.I_CmsExtractionResult;
import org.opencms.search.fields.CmsSearchField;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;

/**
 * Base document factory class for a VFS {@link org.opencms.file.CmsResource},
 * just requires a specialized implementation of
 * {@link I_CmsDocumentFactory#extractContent(CmsObject, CmsResource, CmsSearchIndex)}
 * for text extraction from the binary document content.<p>
 *
 * @since 6.0.0
 */
public abstract class A_CmsVfsDocument implements I_CmsDocumentFactory {

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(A_CmsVfsDocument.class);

    /** Name of the document type. */
    protected String m_name;

    /** The cache used for storing extracted documents, <code>null</code> until set with {@link #setCache(CmsExtractionResultCache)}. */
    private CmsExtractionResultCache m_cache;

    /**
     * Creates a new instance of this lucene document factory.<p>
     *
     * @param name name of the document type
     */
    public A_CmsVfsDocument(String name) {

        m_name = name;
    }

    /**
     * Creates a document factory lookup key for the given resource type name / MIME type configuration.<p>
     *
     * The key has the form <code>PREFIX_type</code> or <code>PREFIX_type:mimeType</code>, where
     * <code>PREFIX</code> is {@link I_CmsSearchDocument#VFS_DOCUMENT_KEY_PREFIX}.<p>
     *
     * If the given mimeType is null, this indicates that the key should
     * match all VFS resource of the given resource type regardless of the MIME type.<p>
     *
     * @param type the resource type name to use
     * @param mimeType the MIME type to use, or <code>null</code> to omit the MIME type part
     *
     * @return a document factory lookup key for the given resource type name / MIME type configuration
     */
    public static String getDocumentKey(String type, String mimeType) {

        // method-local buffer, so the unsynchronized StringBuilder is sufficient
        StringBuilder result = new StringBuilder(16);
        result.append(I_CmsSearchDocument.VFS_DOCUMENT_KEY_PREFIX);
        result.append('_');
        result.append(type);
        if (mimeType != null) {
            result.append(':');
            result.append(mimeType);
        }
        return result.toString();
    }

    /**
     * Generates a new lucene document instance from contents of the given resource for the provided index.<p>
     *
     * If the index extracts content, the extraction result is looked up in this order:
     * the extraction result cache (only for resources with siblings), the content blob stored
     * in the index (only if the indexed content date equals the content date of the resource),
     * and finally a fresh call to <code>extractContent</code>.
     * A failure of the fresh extraction is logged and not propagated, in this case the document
     * is created without extracted content.<p>
     *
     * @param cms the OpenCms context passed on to the extraction and the field configuration
     * @param resource the resource to create the document for
     * @param index the search index to create the document for
     *
     * @return the document created by the field configuration of the index
     *
     * @throws CmsException declared for failures outside of the guarded extraction step
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#createDocument(CmsObject, CmsResource, CmsSearchIndex)
     */
    public I_CmsSearchDocument createDocument(CmsObject cms, CmsResource resource, CmsSearchIndex index)
    throws CmsException {

        // the extracted content, stays null if the index does not extract content or extraction fails
        I_CmsExtractionResult content = null;

        if (index.isExtractingContent()) {
            // do full text content extraction only if required

            // check if caching is enabled for this document type
            CmsExtractionResultCache cache = getCache();
            // hard drive based caching only makes sense for resources that have siblings,
            // because the index will also store the content as a blob
            boolean useCache = (cache != null) && (resource.getSiblingCount() > 1);
            String cacheName = null;
            if (useCache) {
                cacheName = cache.getCacheName(
                    resource,
                    isLocaleDependend() ? index.getLocaleForResource(cms, resource, null) : null,
                    getName());
                content = cache.getCacheObject(cacheName);
            }

            if (content == null) {
                // extraction result has not been found in the cache
                // compare "date of last modification of content" from Lucene index and OpenCms VFS
                // if this is identical, then the data from the Lucene index can be re-used
                I_CmsSearchDocument oldDoc = index.getDocument(CmsSearchField.FIELD_PATH, resource.getRootPath());
                // first check if the document is already in the index
                if ((oldDoc != null) && (oldDoc.getFieldValueAsDate(CmsSearchField.FIELD_DATE_CONTENT) != null)) {
                    long contentDateIndex = oldDoc.getFieldValueAsDate(CmsSearchField.FIELD_DATE_CONTENT).getTime();
                    // now compare the date with the date stored in the resource
                    if (contentDateIndex == resource.getDateContent()) {
                        // extract stored content blob from index
                        content = CmsExtractionResult.fromBytes(oldDoc.getContentBlob());
                    }
                }
            }

            if (content == null) {
                // neither the cache nor the index provided the content, extract it from the resource
                try {
                    content = extractContent(cms, resource, index);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Extracting content for '" + resource.getRootPath() + "' successful.");
                    }
                    if (useCache) {
                        // save extracted content to the cache
                        cache.saveCacheObject(cacheName, content);
                    }
                } catch (CmsIndexNoContentException e) {
                    // there was no content found for the resource
                    LOG.info(
                        Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath())
                            + " "
                            + e.getMessage());
                } catch (Throwable e) {
                    // text extraction failed for document - continue indexing meta information only
                    LOG.error(
                        Messages.get().getBundle().key(Messages.ERR_TEXT_EXTRACTION_1, resource.getRootPath()),
                        e);
                }
            }
        }

        // create the Lucene document according to the index field configuration
        return index.getFieldConfiguration().createDocument(cms, resource, index, content);
    }

    /**
     * Returns the extraction result cache of this factory.<p>
     *
     * @return the cache set with {@link #setCache(CmsExtractionResultCache)}, or <code>null</code> if none has been set
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#getCache()
     */
    public CmsExtractionResultCache getCache() {

        return m_cache;
    }

    /**
     * Returns the document factory lookup keys for all combinations of the given
     * resource type names and MIME types.<p>
     *
     * If the resource type list contains <code>"*"</code>, the names of all resource types known to the
     * resource manager are used instead of the given list.
     * If the MIME type list is empty, one key without MIME type is created per resource type.<p>
     *
     * Both lists are expected to contain {@link String} elements. The elements are cast explicitly,
     * a wrong element type in the key creation loop is reported as a {@link CmsException}.<p>
     *
     * @param resourceTypes the resource type names to create the keys for
     * @param mimeTypes the MIME types to create the keys for
     *
     * @return the list of lookup keys, created with {@link #getDocumentKey(String, String)}
     *
     * @throws CmsException if the key creation fails
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKeys(java.util.List, java.util.List)
     */
    public List getDocumentKeys(List resourceTypes, List mimeTypes) throws CmsException {

        List<String> keys = new ArrayList<String>();

        if (resourceTypes.contains("*")) {
            // wildcard: use the names of all configured resource types
            List<String> allTypes = new ArrayList<String>();
            for (Iterator<?> i = OpenCms.getResourceManager().getResourceTypes().iterator(); i.hasNext();) {
                I_CmsResourceType resourceType = (I_CmsResourceType)i.next();
                allTypes.add(resourceType.getTypeName());
            }
            resourceTypes = allTypes;
        }

        try {
            for (Iterator<?> i = resourceTypes.iterator(); i.hasNext();) {

                String typeName = (String)i.next();
                for (Iterator<?> j = mimeTypes.iterator(); j.hasNext();) {
                    keys.add(getDocumentKey(typeName, (String)j.next()));
                }
                if (mimeTypes.isEmpty()) {
                    // no MIME type configured: the key matches the resource type regardless of the MIME type
                    keys.add(getDocumentKey(typeName, null));
                }
            }
        } catch (Exception exc) {
            throw new CmsException(Messages.get().container(Messages.ERR_CREATE_DOC_KEY_0), exc);
        }

        return keys;
    }

    /**
     * Returns the name of the document type.<p>
     *
     * @return the name given to the constructor
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#getName()
     */
    public String getName() {

        return m_name;
    }

    /**
     * Sets the extraction result cache of this factory.<p>
     *
     * @param cache the cache to use, returned afterwards by {@link #getCache()}
     *
     * @see org.opencms.search.documents.I_CmsDocumentFactory#setCache(org.opencms.search.documents.CmsExtractionResultCache)
     */
    public void setCache(CmsExtractionResultCache cache) {

        m_cache = cache;
    }

    /**
     * Logs content extraction for the specified resource and index.<p>
     *
     * The message is only written if debug logging is enabled.<p>
     *
     * @param resource the resource to log content extraction for
     * @param index the search index to log content extraction for
     */
    protected void logContentExtraction(CmsResource resource, CmsSearchIndex index) {

        if (LOG.isDebugEnabled()) {
            LOG.debug(
                Messages.get().getBundle().key(
                    Messages.LOG_EXTRACT_CONTENT_2,
                    resource.getRootPath(),
                    index.getName()));
        }
    }

    /**
     * Upgrades the given resource to a {@link CmsFile} with content.<p>
     *
     * @param cms the current users OpenCms context
     * @param resource the resource to upgrade
     *
     * @return the given resource upgraded to a {@link CmsFile} with content
     *
     * @throws CmsException if the resource could not be read
     * @throws CmsIndexNoContentException if the resource has no content, that is if the file length is not greater than 0
     */
    protected CmsFile readFile(CmsObject cms, CmsResource resource) throws CmsException, CmsIndexNoContentException {

        CmsFile file = cms.readFile(resource);
        if (file.getLength() <= 0) {
            throw new CmsIndexNoContentException(
                Messages.get().container(Messages.ERR_NO_CONTENT_1, resource.getRootPath()));
        }
        return file;
    }
}