org.opencms.search.solr.spellchecking.CmsSpellcheckDictionaryIndexer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opencms-test Show documentation
OpenCms is an enterprise-ready, easy-to-use website content management system based on Java and XML technology. Offering a complete set of features, OpenCms helps content managers worldwide to create and maintain beautiful websites fast and efficiently.
There is a newer version: 18.0
Show newest version
/*
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (c) Alkacon Software GmbH & Co. KG (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package org.opencms.search.solr.spellchecking;

import org.opencms.file.CmsFile;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsProject;
import org.opencms.file.CmsRequestContext;
import org.opencms.file.CmsResource;
import org.opencms.file.CmsResourceFilter;
import org.opencms.main.CmsException;
import org.opencms.main.CmsLog;
import org.opencms.main.OpenCms;
import org.opencms.main.OpenCmsServlet;
import org.opencms.util.CmsStringUtil;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

import org.apache.commons.logging.Log;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrInputDocument;

/**
 * Helping class for manipulating the Solr spellchecker indices.
 */
public final class CmsSpellcheckDictionaryIndexer {

    /** The log object for this class. */
    private static final Log LOG = CmsLog.getLog(CmsSpellcheckDictionaryIndexer.class);

    /** The default directory that's holding the dictionary files. */
    public static final String DEFAULT_DICTIONARY_DIRECTORY = "/system/modules/org.opencms.workplace.spellcheck/resources";

    /** A regex pattern that applies to the Solr spellcheck directories.
     * Matching string example: "spellchecker_en" */
    public static final String INDEXES_REGEX = "spellchecker_[a-z]{2}";

    /** A regex pattern that applies to custom dictionaries.
     * Matching string example: "custom_dict_en.txt" */
    public static final String CUSTOM_DICTIONARY = "custom_dict_[a-z]{2}.txt";

    /** A regex pattern that applies to the naming of the dictionary files.
     * Matching string example: "dict_en.txt".
     * Note: the '.' is an unescaped regex metacharacter and therefore matches any
     * single character; the same holds for the other file name patterns of this class. */
    public static final String DICTIONARY_NAME_REGEX = "dict_[a-z]{2}.txt";

    /** A regex pattern that applies to the naming of zipped dictionary files.
     * Matching string example: "dict_en.zip" */
    public static final String ZIP_NAME_REGEX = "dict_[a-z]{2}.zip";

    /** Maximum number of documents that are collected before they are sent to the
     * Solr client while parsing a dictionary. This limit is needed in order to prevent
     * OutOfMemoryErrors while parsing large dictionaries. If you encounter such errors
     * you can adjust its value to a smaller number. */
    private static final int MAX_LIST_SIZE = 100000;

    /**
     * FileFilter implementation that accepts only directories whose name matches
     * the spellchecker indices regex.
     */
    private static final FileFilter SPELLCHECKING_DIRECTORY_NAME_FILTER = new FileFilter() {

        @Override
        public boolean accept(File f) {

            return f.isDirectory() && f.getName().matches(INDEXES_REGEX);
        }
    };

    /**
     * Default constructor is private as each method is static.
     */
    private CmsSpellcheckDictionaryIndexer() {

    }

    /**
     * Adds all plain text dictionaries that are available in the default directory.<p>
     *
     * Files named like {@link #DICTIONARY_NAME_REGEX} or {@link #CUSTOM_DICTIONARY} are read,
     * every line is added as one document and the result is committed per file. If the
     * given context is in the online project, its current project is switched to "Offline".
     * Errors are logged and abort the processing of the remaining files.<p>
     *
     * @param client The SolrClient instance object, nothing is done if <code>null</code>.
     * @param cms the cms context, nothing is done if <code>null</code>.
     */
    public static void parseAndAddDictionaries(SolrClient client, CmsObject cms) {

        if ((null == client) || (null == cms)) {
            return;
        }

        // Set the correct cms context
        setCmsOfflineProject(cms);

        try {
            // Get all file resources in the default dictionary directory
            final List<CmsResource> resources = cms.getResourcesInFolder(
                DEFAULT_DICTIONARY_DIRECTORY,
                CmsResourceFilter.DEFAULT_FILES);

            for (final CmsResource resource : resources) {
                final String resourceName = resource.getName();
                // Check if the name of the file matches the dictionary naming scheme
                String lang = null;
                if (resourceName.matches(DICTIONARY_NAME_REGEX)) {
                    // "dict_" has 5 characters, the two letter language code (de, en, es, ...) follows
                    lang = resourceName.substring(5, 7);
                } else if (resourceName.matches(CUSTOM_DICTIONARY)) {
                    // "custom_dict_" has 12 characters, the two letter language code follows
                    lang = resourceName.substring(12, 14);
                }

                if (null != lang) {
                    // Read the file
                    final CmsFile file = cms.readFile(resource);

                    // Parse file content and add it to the server
                    final List<SolrInputDocument> documents = new ArrayList<SolrInputDocument>();

                    readAndAddDocumentsFromStream(
                        client,
                        lang,
                        new ByteArrayInputStream(file.getContents()),
                        documents,
                        true);

                    // Add and commit the remaining documents to the server
                    addDocuments(client, documents, true);
                }
            }

        } catch (CmsException e) {
            LOG.warn("Could not read from resource. ", e);
        } catch (IOException e) {
            LOG.warn("Could not successfully parse the dictionary. ", e);
        } catch (SolrServerException e) {
            LOG.warn("Exception while adding documents to Solr server. ", e);
        }
    }

    /**
     * Adds all zipped dictionaries that are available in the default directory.<p>
     *
     * Every file named like {@link #ZIP_NAME_REGEX} is opened and each contained entry named
     * like {@link #DICTIONARY_NAME_REGEX} is parsed line by line. Entries with other names
     * are skipped. All documents are committed once after the last zip file was processed.
     * Errors are logged and abort the processing of the remaining files.<p>
     *
     * @param client The SolrClient instance object, nothing is done if <code>null</code>.
     * @param cms The OpenCms instance object, nothing is done if <code>null</code>.
     */
    public static void parseAndAddZippedDictionaries(SolrClient client, CmsObject cms) {

        if ((null == client) || (null == cms)) {
            return;
        }

        try {
            final List<CmsResource> resources = cms.getResourcesInFolder(
                DEFAULT_DICTIONARY_DIRECTORY,
                CmsResourceFilter.DEFAULT_FILES);

            // List holding all input documents, regardless of language
            final List<SolrInputDocument> documents = new LinkedList<SolrInputDocument>();

            for (final CmsResource resource : resources) {
                final String zipFileName = resource.getName();
                if (!zipFileName.matches(ZIP_NAME_REGEX)) {
                    continue;
                }

                final CmsFile cmsFile = cms.readFile(resource);

                // Read zip file content
                final ZipInputStream zipStream = new ZipInputStream(
                    new ByteArrayInputStream(cmsFile.getContents()));

                try {
                    // Iterate over each file in the zip file. The next entry is fetched on
                    // every iteration, also for entries that are skipped, as the loop would
                    // never terminate otherwise.
                    for (ZipEntry entry = zipStream.getNextEntry(); null != entry; entry = zipStream.getNextEntry()) {
                        final String name = entry.getName();

                        if (name.matches(DICTIONARY_NAME_REGEX)) {
                            // The (matching) filename reveals the language
                            final String lang = name.substring(5, 7);

                            // Parse and add documents, the stream must stay open for the
                            // entries that follow
                            readAndAddDocumentsFromStream(client, lang, zipStream, documents, false);
                        }
                    }
                } finally {
                    zipStream.close();
                }
            }

            // Add all documents
            addDocuments(client, documents, true);
        } catch (IOException e) {
            LOG.warn("Failed while reading from " + DEFAULT_DICTIONARY_DIRECTORY + ". ", e);
        } catch (CmsException e) {
            LOG.warn("Failed reading resource " + DEFAULT_DICTIONARY_DIRECTORY + ". ", e);
        } catch (SolrServerException e) {
            LOG.warn("Failed adding documents to Solr server. ", e);
        }
    }

    /**
     * Checks whether a build of the indices is necessary.<p>
     *
     * This is the case if the spellcheck index directories contain no index data, or if the
     * most recently modified dictionary is newer than the oldest index directory. If the
     * given context is in the online project, its current project is switched to "Offline".<p>
     *
     * @param cms The appropriate CmsObject instance.
     * @return true, if the spellcheck indices have to be rebuilt, otherwise false
     */
    public static boolean updatingIndexNecessesary(CmsObject cms) {

        // Set request to the offline project.
        setCmsOfflineProject(cms);

        // Check whether the spellcheck index directories are empty.
        // If they are, the index has to be built obviously.
        if (isSolrSpellcheckIndexDirectoryEmpty()) {
            return true;
        }

        // Compare the most recent date of a dictionary with the oldest timestamp
        // that determines when an index has been built.
        final long dateMostRecentDictionary = getMostRecentDate(cms);
        final long dateOldestIndexWrite = getOldestIndexDate(cms);

        return dateMostRecentDictionary > dateOldestIndexWrite;
    }

    /**
     * Adds a list of documents to the Solr client.<p>
     *
     * An empty list is not sent to the client, the commit is executed nevertheless
     * if requested.<p>
     *
     * @param client The SolrClient instance object, nothing is done if <code>null</code>.
     * @param documents The documents that should be added, nothing is done if <code>null</code>.
     * @param commit boolean flag indicating whether a "commit" call should be made after adding the documents
     *
     * @throws IOException in case something goes wrong
     * @throws SolrServerException in case something goes wrong
     */
    static void addDocuments(SolrClient client, List<SolrInputDocument> documents, boolean commit)
    throws IOException, SolrServerException {

        if ((null == client) || (null == documents)) {
            return;
        }

        if (!documents.isEmpty()) {
            client.add(documents);
        }

        if (commit) {
            client.commit();
        }
    }

    /**
     * Deletes all documents from the Solr client and commits the deletion.<p>
     *
     * @param client The SolrClient instance object, nothing is done if <code>null</code>.
     *
     * @throws IOException in case something goes wrong
     * @throws SolrServerException in case something goes wrong
     */
    static void deleteAllFiles(SolrClient client) throws IOException, SolrServerException {

        if (null == client) {
            return;
        }

        client.deleteByQuery("*:*");
        client.commit();
    }

    /**
     * Deletes a single word from the Solr client.<p>
     *
     * Nothing is deleted if one of the parameters is missing or if the trimmed word still
     * contains a blank, i.e. if it consists of more than one word. No commit is executed.<p>
     *
     * @param client The SolrClient instance object.
     * @param lang The affected language.
     * @param word The word that should be removed.
     *
     * @throws IOException in case something goes wrong
     * @throws SolrServerException in case something goes wrong
     */
    static void deleteDocument(SolrClient client, String lang, String word) throws IOException, SolrServerException {

        if ((null == client)
            || CmsStringUtil.isEmptyOrWhitespaceOnly(lang)
            || CmsStringUtil.isEmptyOrWhitespaceOnly(word)) {
            return;
        }

        // Make sure the parameter holding the word that should be deleted
        // contains just a single word
        final String singleWord = word.trim();
        if (!singleWord.contains(" ")) {
            // NOTE(review): the word is inserted into the query as is, characters that are
            // special to the Solr query syntax are not escaped - confirm that callers
            // only pass plain words.
            final String query = String.format("entry_%s:%s", lang, singleWord);
            client.deleteByQuery(query);
        }
    }

    /**
     * Determines and returns the timestamp of the most recently modified spellchecker file.<p>
     *
     * Plain, zipped and custom dictionaries in the default directory are taken into account.<p>
     *
     * @param cms the OpenCms instance.
     * @return timestamp of type long, <code>Long.MIN_VALUE</code> if no dictionary was found
     *         or the directory could not be read.
     */
    private static long getMostRecentDate(CmsObject cms) {

        long mostRecentDate = Long.MIN_VALUE;

        try {
            final List<CmsResource> resources = cms.getResourcesInFolder(
                DEFAULT_DICTIONARY_DIRECTORY,
                CmsResourceFilter.DEFAULT_FILES);

            for (final CmsResource resource : resources) {
                final String resourceName = resource.getName();
                // Check whether the resource matches the desired patterns
                final boolean isDictionary = resourceName.matches(DICTIONARY_NAME_REGEX)
                    || resourceName.matches(ZIP_NAME_REGEX)
                    || resourceName.matches(CUSTOM_DICTIONARY);
                if (isDictionary && (resource.getDateLastModified() > mostRecentDate)) {
                    mostRecentDate = resource.getDateLastModified();
                }
            }
        } catch (CmsException e) {
            LOG.error("Could not read spellchecker dictionaries. ", e);
        }

        return mostRecentDate;
    }

    /**
     * Returns the timestamp of the index whose index-build operation lies the
     * furthest back in the past.<p>
     *
     * @param cms the OpenCms instance (currently not used).
     * @return timestamp as type long, <code>Long.MIN_VALUE</code> if no index directory was found.
     */
    private static long getOldestIndexDate(CmsObject cms) {

        final File path = new File(getSolrSpellcheckRfsPath());
        // listFiles returns null if the path is not a directory or can not be read
        final File[] directories = path.listFiles(SPELLCHECKING_DIRECTORY_NAME_FILTER);

        // Initialize with the greatest value a long type can hold
        long oldestIndexDate = Long.MAX_VALUE;

        if (null != directories) {
            for (final File dir : directories) {
                final long date = dir.lastModified();
                if (date < oldestIndexDate) {
                    oldestIndexDate = date;
                }
            }
        }

        // If no directory has been found oldestIndexDate is still holding
        // Long.MAX_VALUE. In that case Long.MIN_VALUE is returned. Note that
        // updatingIndexNecessesary then reports true as soon as at least one
        // dictionary file exists, since any real timestamp is greater than
        // Long.MIN_VALUE.
        if (Long.MAX_VALUE == oldestIndexDate) {
            LOG.warn("It appears that no spellcheck indices have been found in " + getSolrSpellcheckRfsPath() + ". ");
            return Long.MIN_VALUE;
        }

        return oldestIndexDate;
    }

    /**
     * Returns the path in the RFS where the Solr spellcheck files reside.<p>
     *
     * @return String representation of Solr's spellcheck RFS path.
     */
    private static String getSolrSpellcheckRfsPath() {

        String sPath = OpenCms.getSystemInfo().getWebInfRfsPath();

        if (!sPath.endsWith(File.separator)) {
            sPath += File.separator;
        }

        return sPath + "solr" + File.separator + "spellcheck" + File.separator + "data";
    }

    /**
     * Returns whether the Solr spellchecking index directories are empty
     * (not initialized) or not.<p>
     *
     * @return true, if the directories contain no indexed data, otherwise false.
     */
    private static boolean isSolrSpellcheckIndexDirectoryEmpty() {

        final File path = new File(getSolrSpellcheckRfsPath());
        final File[] directories = path.listFiles(SPELLCHECKING_DIRECTORY_NAME_FILTER);

        // Each directory that has been created by Solr but hasn't been indexed yet
        // contains exactly two files. If there are more files, at least one index has
        // already been built, so return false in that case.
        if (directories != null) {
            for (final File directory : directories) {
                // list returns null if the directory can not be read, such a
                // directory is treated like one without index data
                final String[] content = directory.list();
                if ((null != content) && (content.length > 2)) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Parses the dictionary from an InputStream, one document is created per line.<p>
     *
     * Whenever the list reaches {@link #MAX_LIST_SIZE} entries, its content is sent to the
     * client (without commit) and the list is cleared. The documents that remain in the
     * list when the method returns have to be added by the caller. Errors are logged and
     * end the parsing of the stream.<p>
     *
     * @param client The SolrClient instance object.
     * @param lang The language of the dictionary.
     * @param is The InputStream object.
     * @param documents List to put the assembled SolrInputDocuments into.
     * @param closeStream boolean flag that determines whether to close the inputstream
     * or not.
     */
    private static void readAndAddDocumentsFromStream(
        final SolrClient client,
        final String lang,
        final InputStream is,
        final List<SolrInputDocument> documents,
        final boolean closeStream) {

        // NOTE(review): the content is decoded with the platform default charset -
        // confirm the encoding the dictionary files are stored in.
        final BufferedReader br = new BufferedReader(new InputStreamReader(is));

        try {
            for (String line = br.readLine(); null != line; line = br.readLine()) {

                final SolrInputDocument document = new SolrInputDocument();
                // Each field is named after the schema "entry_xx" where xx denotes
                // the two letter language code. See the file spellcheck/conf/schema.xml.
                document.addField("entry_" + lang, line);
                documents.add(document);

                // Prevent OutOfMemoryErrors ...
                if (documents.size() >= MAX_LIST_SIZE) {
                    addDocuments(client, documents, false);
                    documents.clear();
                }
            }
        } catch (IOException e) {
            LOG.error("Could not read spellcheck dictionary from input stream.", e);
        } catch (SolrServerException e) {
            LOG.error("Error while adding documents to Solr server. ", e);
        } finally {
            if (closeStream) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // The content has been processed already, a failure while
                    // closing the stream can not be handled in a useful way.
                }
            }
        }
    }

    /**
     * Sets the appropriate OpenCms context.<p>
     *
     * If the current project of the given context is the online project, the current
     * project is replaced by the project named "Offline". A failure is logged only.<p>
     *
     * @param cms The OpenCms instance object, nothing is done if <code>null</code>.
     */
    private static void setCmsOfflineProject(CmsObject cms) {

        if (null == cms) {
            return;
        }

        final CmsRequestContext cmsContext = cms.getRequestContext();
        final CmsProject cmsProject = cmsContext.getCurrentProject();

        if (cmsProject.isOnlineProject()) {
            try {
                final CmsProject cmsOfflineProject = cms.readProject("Offline");
                cmsContext.setCurrentProject(cmsOfflineProject);
            } catch (CmsException e) {
                LOG.warn("Could not set the current project to \"Offline\". ", e);
            }
        }
    }
}