All downloads are free. The search and download functionality uses the official Maven repository.

org.opencms.search.solr.CmsSolrDocumentXmlContent Maven / Gradle / Ivy

Go to download

OpenCms is an enterprise-ready, easy-to-use website content management system based on Java and XML technology. Offering a complete set of features, OpenCms helps content managers worldwide to create and maintain beautiful websites quickly and efficiently.

There is a newer version: 18.0
Show newest version
/*
 * File   : $Source$
 * Date   : $Date$
 * Version: $Revision$
 *
 * This library is part of OpenCms -
 * the Open Source Content Management System
 *
 * Copyright (C) 2002 - 2009 Alkacon Software (http://www.alkacon.com)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * For further information about Alkacon Software, please see the
 * company website: http://www.alkacon.com
 *
 * For further information about OpenCms, please see the
 * project website: http://www.opencms.org
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package org.opencms.search.solr;

import org.opencms.file.CmsFile;
import org.opencms.file.CmsObject;
import org.opencms.file.CmsPropertyDefinition;
import org.opencms.file.CmsResource;
import org.opencms.file.types.CmsResourceTypeXmlContent;
import org.opencms.file.types.I_CmsResourceType;
import org.opencms.i18n.CmsLocaleManager;
import org.opencms.jsp.CmsJspTagContainer;
import org.opencms.main.CmsException;
import org.opencms.main.CmsLog;
import org.opencms.main.OpenCms;
import org.opencms.search.CmsIndexException;
import org.opencms.search.CmsSearchIndex;
import org.opencms.search.documents.A_CmsVfsDocument;
import org.opencms.search.documents.CmsIndexNoContentException;
import org.opencms.search.documents.Messages;
import org.opencms.search.extractors.CmsExtractionResult;
import org.opencms.search.extractors.I_CmsExtractionResult;
import org.opencms.search.fields.CmsSearchField;
import org.opencms.search.fields.CmsSearchFieldConfiguration;
import org.opencms.search.galleries.CmsGalleryNameMacroResolver;
import org.opencms.util.CmsStringUtil;
import org.opencms.xml.A_CmsXmlDocument;
import org.opencms.xml.CmsXmlContentDefinition;
import org.opencms.xml.CmsXmlUtils;
import org.opencms.xml.content.CmsXmlContent;
import org.opencms.xml.content.CmsXmlContentFactory;
import org.opencms.xml.content.I_CmsXmlContentHandler;
import org.opencms.xml.types.CmsXmlNestedContentDefinition;
import org.opencms.xml.types.I_CmsXmlContentValue;
import org.opencms.xml.types.I_CmsXmlSchemaType;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;

import com.google.common.collect.Sets;

/**
 * Special document text extraction factory for Solr index.

* * @since 8.5.0 */ public class CmsSolrDocumentXmlContent extends A_CmsVfsDocument { /** * The gallery name is determined by resolving the macros in a string which can either come from a field mapped * to the gallery name, or the title, or from default values for those fields. This class is used to select the * value to use and performs the macro substitution. */ private static class GalleryNameChooser { /** CMS context for this instance. */ private CmsObject m_cms; /** Current XML content. */ private A_CmsXmlDocument m_content; /** Default value of field mapped to gallery name. */ private String m_defaultGalleryNameValue; /** Default value of field mapped to title. */ private String m_defaultTitleValue; /** Current locale. */ private Locale m_locale; /** Content value mapped to gallery name. */ private String m_mappedGalleryNameValue; /** Content value mapped to title. */ private String m_mappedTitleValue; /** * Creates a new instance.

* * @param cms the CMS context * @param content the XML content * @param locale the locale in the XML content */ public GalleryNameChooser(CmsObject cms, A_CmsXmlDocument content, Locale locale) { m_cms = cms; m_content = content; m_locale = locale; } /** * Selects the gallery name.

* * This method assumes that all the available values have been set via the setters of this class. * * @return the gallery name * * @throws CmsException of something goes wrong */ public String getGalleryName() throws CmsException { String result = null; for (String resultCandidateWithMacros : new String[] { // Prioritize gallery name over title, and actual content values over defaults m_mappedGalleryNameValue, m_defaultGalleryNameValue, m_mappedTitleValue, m_defaultTitleValue}) { if (!CmsStringUtil.isEmptyOrWhitespaceOnly(resultCandidateWithMacros)) { CmsGalleryNameMacroResolver resolver = new CmsGalleryNameMacroResolver(m_cms, m_content, m_locale); result = resolver.resolveMacros(resultCandidateWithMacros); return result; } } result = m_cms.readPropertyObject( m_content.getFile(), CmsPropertyDefinition.PROPERTY_TITLE, false).getValue(); return result; } /** * Sets the defaultGalleryNameValue.

* * @param defaultGalleryNameValue the defaultGalleryNameValue to set */ public void setDefaultGalleryNameValue(String defaultGalleryNameValue) { m_defaultGalleryNameValue = defaultGalleryNameValue; } /** * Sets the defaultTitleValue.

* * @param defaultTitleValue the defaultTitleValue to set */ public void setDefaultTitleValue(String defaultTitleValue) { m_defaultTitleValue = defaultTitleValue; } /** * Sets the mappedGalleryNameValue.

* * @param mappedGalleryNameValue the mappedGalleryNameValue to set */ public void setMappedGalleryNameValue(String mappedGalleryNameValue) { m_mappedGalleryNameValue = mappedGalleryNameValue; } /** * Sets the mappedTitleValue.

* * @param mappedTitleValue the mappedTitleValue to set */ public void setMappedTitleValue(String mappedTitleValue) { m_mappedTitleValue = mappedTitleValue; } } /** Mapping name used to indicate that the value should be used for the gallery name. */ public static final String MAPPING_GALLERY_NAME = "galleryName"; /** The solr document type name for xml-contents. */ public static final String TYPE_XMLCONTENT_SOLR = "xmlcontent-solr"; /** The log object for this class. */ private static final Log LOG = CmsLog.getLog(CmsSolrDocumentXmlContent.class); /** * Public constructor.

* * @param name the name for the document type */ public CmsSolrDocumentXmlContent(String name) { super(name); } /** * Collects a list of all possible XPaths for a content definition.

* * @param cms the CMS context to use * @param def the content definition * @param path the path of the given content definition * @param result the set used to collect the XPaths */ public static void collectSchemaXpathsForSimpleValues( CmsObject cms, CmsXmlContentDefinition def, String path, Set result) { List nestedTypes = def.getTypeSequence(); for (I_CmsXmlSchemaType nestedType : nestedTypes) { String subPath = path + "/" + nestedType.getName(); if (nestedType instanceof CmsXmlNestedContentDefinition) { CmsXmlContentDefinition nestedDef = ((CmsXmlNestedContentDefinition)nestedType).getNestedContentDefinition(); collectSchemaXpathsForSimpleValues(cms, nestedDef, subPath, result); } else { result.add(subPath); } } } /** * Extracts the content of a single XML content resource.

* * @param cms the cms context * @param resource the resource * @param index the used index * * @return the extraction result * * @throws CmsException in case reading or unmarshalling the content fails */ public static CmsExtractionResult extractXmlContent(CmsObject cms, CmsResource resource, CmsSearchIndex index) throws CmsException { return extractXmlContent(cms, resource, index, null); } /** * Extracts the content of a single XML content resource.

* * @param cms the cms context * @param resource the resource * @param index the used index * @param forceLocale if set, only the content values for the given locale will be extracted * * @return the extraction result * * @throws CmsException in case reading or unmarshalling the content fails */ public static CmsExtractionResult extractXmlContent( CmsObject cms, CmsResource resource, CmsSearchIndex index, Locale forceLocale) throws CmsException { // un-marshal the content CmsFile file = cms.readFile(resource); if (file.getLength() <= 0) { throw new CmsIndexNoContentException( Messages.get().container(Messages.ERR_NO_CONTENT_1, resource.getRootPath())); } A_CmsXmlDocument xmlContent = CmsXmlContentFactory.unmarshal(cms, file); // initialize some variables Map> items = new HashMap>(); Map fieldMappings = new HashMap(); List contentLocales = forceLocale != null ? Collections.singletonList(forceLocale) : xmlContent.getLocales(); Locale resourceLocale = index.getLocaleForResource(cms, resource, contentLocales); LinkedHashMap localeItems = null; // loop over the locales of the content for (Locale locale : contentLocales) { GalleryNameChooser galleryNameChooser = new GalleryNameChooser(cms, xmlContent, locale); localeItems = new LinkedHashMap(); StringBuffer textContent = new StringBuffer(); // store the locales of the content as space separated field // loop over the available element paths of the current content locale List paths = xmlContent.getNames(locale); for (String xpath : paths) { // try to get the value extraction for the current element path String extracted = null; I_CmsXmlContentValue value = xmlContent.getValue(xpath, locale); try { extracted = value.getPlainText(cms); if (CmsStringUtil.isEmptyOrWhitespaceOnly(extracted) && value.isSimpleType()) { // no text value for simple type, so take the string value as item extracted = value.getStringValue(cms); } } catch (Exception e) { // it can happen that a exception is thrown while extracting a single value 
LOG.warn(Messages.get().container(Messages.LOG_EXTRACT_VALUE_2, xpath, resource), e); } // put the extraction to the items and to the textual content if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(extracted)) { localeItems.put(xpath, extracted); } if (value.getContentDefinition().getContentHandler().isSearchable(value) && CmsStringUtil.isNotEmptyOrWhitespaceOnly(extracted)) { // value is search-able and the extraction is not empty, so added to the textual content textContent.append(extracted); textContent.append('\n'); } List mappings = xmlContent.getHandler().getMappings(value.getPath()); if (mappings.size() > 0) { // mappings are defined, lets check if we have mappings that interest us for (String mapping : mappings) { if (mapping.startsWith(I_CmsXmlContentHandler.MAPTO_PROPERTY)) { // this is a property mapping String propertyName = mapping.substring(mapping.lastIndexOf(':') + 1); if (CmsPropertyDefinition.PROPERTY_TITLE.equals(propertyName) || CmsPropertyDefinition.PROPERTY_DESCRIPTION.equals(propertyName)) { if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(extracted)) { // search index field names and property names are different ["Title" vs. 
"title"] String fieldName = null; if (CmsPropertyDefinition.PROPERTY_TITLE.equals(propertyName)) { galleryNameChooser.setMappedTitleValue(extracted); } else { // if field is not title, it must be description fieldName = CmsSearchField.FIELD_DESCRIPTION; fieldMappings.put( CmsSearchFieldConfiguration.getLocaleExtendedName(fieldName, locale) + "_s", extracted); } } } } else if (mapping.equals(MAPPING_GALLERY_NAME)) { galleryNameChooser.setMappedGalleryNameValue(value.getPlainText(cms)); } } } } Set xpaths = Sets.newHashSet(); collectSchemaXpathsForSimpleValues(cms, xmlContent.getContentDefinition(), "", xpaths); for (String xpath : xpaths) { // mappings always are stored with indexes, so we add them to the xpath List mappings = xmlContent.getHandler().getMappings(CmsXmlUtils.createXpath(xpath, 1)); for (String mapping : mappings) { if (mapping.equals(MAPPING_GALLERY_NAME) || mapping.equals( I_CmsXmlContentHandler.MAPTO_PROPERTY + CmsPropertyDefinition.PROPERTY_TITLE)) { String defaultValue = xmlContent.getHandler().getDefault( cms, xmlContent.getFile(), null, xpath, locale); if (mapping.equals(MAPPING_GALLERY_NAME)) { galleryNameChooser.setDefaultGalleryNameValue(defaultValue); } else { galleryNameChooser.setDefaultTitleValue(defaultValue); } } } } final String galleryTitleFieldKey = CmsSearchFieldConfiguration.getLocaleExtendedName( CmsSearchField.FIELD_TITLE_UNSTORED, locale) + "_s"; final String galleryNameValue = galleryNameChooser.getGalleryName(); fieldMappings.put(galleryTitleFieldKey, galleryNameValue); // handle the textual content if (textContent.length() > 0) { // add the textual content with a localized key to the items //String key = CmsSearchFieldConfiguration.getLocaleExtendedName(CmsSearchField.FIELD_CONTENT, locale); //items.put(key, textContent.toString()); // use the default locale of this resource as general text content for the extraction result localeItems.put(I_CmsExtractionResult.ITEM_CONTENT, textContent.toString()); } items.put(locale, 
localeItems); } // if the content is locale independent, it should have only one content locale, but that should be indexed for all available locales. // TODO: One could think of different indexing behavior, i.e., index only for getDefaultLocales(cms,resource) // But using getAvailableLocales(cms,resource) does not work, because locale-available is set to "en" for all that content. if ((xmlContent instanceof CmsXmlContent) && ((CmsXmlContent)xmlContent).isLocaleIndependent()) { if (forceLocale != null) { items.put(forceLocale, localeItems); } else { for (Locale l : OpenCms.getLocaleManager().getAvailableLocales()) { items.put(l, localeItems); } } } // add the locales that have been indexed for this document as item and return the extraction result // fieldMappings.put(CmsSearchField.FIELD_RESOURCE_LOCALES, locales.toString().trim()); return new CmsExtractionResult(resourceLocale, items, fieldMappings); } /** * @see org.opencms.search.documents.CmsDocumentXmlContent#extractContent(org.opencms.file.CmsObject, org.opencms.file.CmsResource, org.opencms.search.CmsSearchIndex) */ @Override public I_CmsExtractionResult extractContent(CmsObject cms, CmsResource resource, CmsSearchIndex index) throws CmsException { logContentExtraction(resource, index); try { I_CmsExtractionResult result = null; List ex = new ArrayList(); for (CmsResource detailContainers : CmsJspTagContainer.getDetailOnlyResources(cms, resource)) { CmsSolrDocumentContainerPage containerpageExtractor = new CmsSolrDocumentContainerPage(""); String localeTemp = detailContainers.getRootPath(); localeTemp = CmsResource.getParentFolder(localeTemp); localeTemp = CmsResource.getName(localeTemp); localeTemp = localeTemp.substring(0, localeTemp.length() - 1); Locale locale = CmsLocaleManager.getLocale(localeTemp); if (CmsJspTagContainer.useSingleLocaleDetailContainers( OpenCms.getSiteManager().getSiteRoot(resource.getRootPath())) && locale.equals(CmsLocaleManager.getDefaultLocale())) { // in case of single locale 
detail containers do not force the locale locale = null; } I_CmsExtractionResult containersExtractionResult = containerpageExtractor.extractContent( cms, detailContainers, index, locale); // only use the locales of the resource itself, not the ones of the detail containers page containersExtractionResult.getContentItems().remove(CmsSearchField.FIELD_RESOURCE_LOCALES); ex.add(containersExtractionResult); } result = extractXmlContent(cms, resource, index); result = result.merge(ex); return result; } catch (Throwable t) { throw new CmsIndexException(Messages.get().container(Messages.ERR_TEXT_EXTRACTION_1, resource), t); } } /** * @see org.opencms.search.documents.I_CmsDocumentFactory#getDocumentKeys(java.util.List, java.util.List) */ @Override public List getDocumentKeys(List resourceTypes, List mimeTypes) throws CmsException { if (resourceTypes.contains("*")) { // we need to find all configured XML content types List allTypes = new ArrayList(); for (Iterator i = OpenCms.getResourceManager().getResourceTypes().iterator(); i.hasNext();) { I_CmsResourceType resourceType = i.next(); if ((resourceType instanceof CmsResourceTypeXmlContent) // either we need a configured schema, or another class name (which must then contain an inline schema) && (((CmsResourceTypeXmlContent)resourceType).getConfiguration().containsKey( CmsResourceTypeXmlContent.CONFIGURATION_SCHEMA) || !CmsResourceTypeXmlContent.class.equals(resourceType.getClass()))) { // add the XML content resource type name allTypes.add(resourceType.getTypeName()); } } resourceTypes = allTypes; } return super.getDocumentKeys(resourceTypes, mimeTypes); } /** * Solr index content is stored in multiple languages, so the result is NOT locale dependent.

* * @see org.opencms.search.documents.I_CmsDocumentFactory#isLocaleDependend() */ public boolean isLocaleDependend() { return false; } /** * @see org.opencms.search.documents.I_CmsDocumentFactory#isUsingCache() */ public boolean isUsingCache() { return true; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy