io.uhndata.cards.vocabularies.internal.AbstractNCITIndexer Maven / Gradle / Ivy

Go to download
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package io.uhndata.cards.vocabularies.internal;

import java.io.File;
import java.io.IOException;

import javax.jcr.Node;
import javax.jcr.RepositoryException;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.sling.api.SlingHttpServletRequest;
import org.apache.sling.api.SlingHttpServletResponse;
import org.osgi.service.component.annotations.Reference;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.uhndata.cards.vocabularies.spi.VocabularyIndexException;
import io.uhndata.cards.vocabularies.spi.VocabularyIndexer;
import io.uhndata.cards.vocabularies.spi.VocabularyParserUtils;

/**
 * Abstract class specifying a vocabulary ontology indexer specifically for the National Cancer Institute Thesaurus. The
 * class implements methods common to parsers for the NCIT, but omits file-type specific methods. The parsing and node
 * creation process is done as a transaction, meaning that if it fails, then proposed changes saved in storage will not
 * be applied, and the repository will be left in its original state.
 * 
 * The indexer assumes that the resource of the request it is given is a VocabulariesHomepage node under
 * which the Vocabulary node instance should be stored in the Jackrabbit Oak repository as a child.
 * 
 *
 * @version $Id: 23d03aa3d6e68d57be668bacf342b1e428e68e47 $
 */
public abstract class AbstractNCITIndexer implements VocabularyIndexer
{
    private static final Logger LOGGER = LoggerFactory.getLogger(AbstractNCITIndexer.class);

    /**
     * Fixed prefix for the temporary download file. {@link File#createTempFile(String, String)} rejects prefixes
     * shorter than three characters, so the request-supplied identifier is deliberately not used as the prefix.
     */
    private static final String TEMP_FILE_PREFIX = "ncit-vocabulary-";

    @Reference
    protected VocabularyParserUtils utils;

    /**
     * Method called by the {@link io.uhndata.cards.vocabularies.VocabularyIndexerServlet} to parse and index a NCIT
     * vocabulary. Specifying the version to index is mandatory. There are two optional parameters.
     * <ul>
     * <li>"localpath" - allows loading of NCIT from a local path instead of downloading it; takes precedence over
     * "httppath" when both are given.</li>
     * <li>"httppath" - allows downloading of NCIT from a url other than the default source returned by
     * {@link #getDefaultSource(String)}.</li>
     * </ul>
     * <p>
     * The optional "identifier" parameter names the vocabulary node and defaults to "ncit" when blank.
     * </p>
     * <p>
     * Also the following parameter is required if you want to overwrite a vocabulary that already exists in the
     * repository: "overwrite". It is passed unchanged to {@code VocabularyParserUtils.clearVocabularyNode}, which is
     * expected to reject the request unless the value is "true".
     * </p>
     * <p>
     * Any failure raised while indexing is caught here: it is logged, the pending (unsaved) changes of the JCR
     * session are discarded, and an error status JSON is written to the response. The temporary source file is
     * always deleted.
     * </p>
     *
     * @param source the source name passed by the caller; not used by this implementation
     * @param request http request from {@link io.uhndata.cards.vocabularies.VocabularyIndexerServlet}
     * @param response http response from {@link io.uhndata.cards.vocabularies.VocabularyIndexerServlet}
     * @throws IOException thrown when response Json cannot be written
     * @throws VocabularyIndexException declared by the interface; indexing failures are reported through the
     *             response JSON rather than rethrown by the indexing steps themselves
     */
    @Override
    public void index(final String source, final SlingHttpServletRequest request,
        final SlingHttpServletResponse response)
        throws IOException, VocabularyIndexException
    {
        // Obtain relevant request parameters.
        final String identifier = StringUtils.defaultIfBlank(request.getParameter("identifier"), "ncit");
        final String version = request.getParameter("version");
        final String httppath = request.getParameter("httppath");
        final String localpath = request.getParameter("localpath");
        final String overwrite = request.getParameter("overwrite");

        // Obtain the resource of the request and adapt it to a JCR node. This must be the /Vocabularies homepage node.
        final Node homepage = request.getResource().adaptTo(Node.class);

        // Created only after validation succeeds; stays null otherwise, which deleteQuietly tolerates.
        File temporaryFile = null;
        try {
            // Throw exceptions if mandatory parameters are not found or if homepage node cannot be found.
            // A blank version is treated like a missing one, since it cannot identify a release to fetch.
            if (StringUtils.isBlank(version)) {
                throw new VocabularyIndexException("Mandatory version parameter not provided.");
            }

            if (homepage == null) {
                throw new VocabularyIndexException("Could not access resource of your request.");
            }

            // Created inside the try block so that a failure here is reported like any other indexing error,
            // and before the repository session is touched.
            temporaryFile = File.createTempFile(TEMP_FILE_PREFIX, "");

            // Delete the Vocabulary node already representing this vocabulary instance if it exists
            this.utils.clearVocabularyNode(homepage, identifier, overwrite);

            // Load temporary NCIT zip file. The default location is provided by the concrete subclass.
            String sourceLocation = getDefaultSource(version);

            final VocabularyZipLoader zipLoader = new VocabularyZipLoader();
            if (localpath != null) {
                sourceLocation = localpath;
                zipLoader.loadZipLocal(localpath, temporaryFile);
            } else if (httppath != null) {
                sourceLocation = httppath;
                zipLoader.loadZipHttp(httppath, temporaryFile);
            } else {
                zipLoader.loadZipHttp(sourceLocation, temporaryFile);
            }

            // Create a new Vocabulary node instance representing this vocabulary instance
            final String name = "National Cancer Institute Thesaurus";
            final Node vocabularyNode = createNCITVocabularyNode(homepage, identifier, name, sourceLocation, version);

            // Parse the NCIT zip file and create VocabularyTerm node children
            parseNCIT(temporaryFile, vocabularyNode);

            /*
             * Save the JCR session. If any errors occur before this step, all proposed changes will not be applied and
             * the repository will remain in its original state.
             */
            saveSession(homepage);

            // Success response json
            this.utils.writeStatusJson(request, response, true, null);
        } catch (Exception e) {
            // Log first, so the failure is recorded even if writing the response fails as well.
            LOGGER.error("NCIT indexing error: {}", e.getMessage(), e);
            // Drop the half-built vocabulary from the session so a later save cannot persist it.
            discardPendingChanges(homepage);
            // If parsing fails, return an error json with the exception message
            this.utils.writeStatusJson(request, response, false, "NCIT Flat indexing error: " + e.getMessage());
        } finally {
            // Delete temporary source file
            FileUtils.deleteQuietly(temporaryFile);
        }
    }

    /**
     * Discards all unsaved changes recorded in the session of the given node, so that a failed indexing attempt
     * leaves no transient node removals or additions behind. A failure to refresh is logged and otherwise ignored,
     * since this is only invoked while already handling an indexing error.
     *
     * @param homepage the VocabulariesHomepage node obtained from the request, may be {@code null}
     */
    private void discardPendingChanges(Node homepage)
    {
        if (homepage == null) {
            return;
        }
        try {
            homepage.getSession().refresh(false);
        } catch (RepositoryException e) {
            LOGGER.warn("Failed to discard pending changes after NCIT indexing error: {}", e.getMessage(), e);
        }
    }

    /**
     * Creates a Vocabulary node that represents the current vocabulary instance, with the identifier as
     * the name of the node. The vocabulary property website is currently fixed to
     * https://ncit.nci.nih.gov/ncitbrowser/.
     *
     * @param homepage VocabulariesHomepage node instance that will be parent of the new vocabulary node
     * @param identifier short unique identifier of the vocabulary
     * @param name the official name of the vocabulary
     * @param source source of the vocabulary, usually a URL
     * @param version the version of the vocabulary, a short string
     * @return the Vocabulary node that was created
     * @throws VocabularyIndexException when node cannot be created
     */
    private Node createNCITVocabularyNode(Node homepage, String identifier, String name, String source, String version)
        throws VocabularyIndexException
    {
        try {
            Node vocabularyNode = homepage.addNode("./" + identifier, "cards:Vocabulary");
            vocabularyNode.setProperty("identifier", identifier);
            vocabularyNode.setProperty("name", name);
            vocabularyNode.setProperty("source", source);
            vocabularyNode.setProperty("version", version);
            vocabularyNode.setProperty("website", "https://ncit.nci.nih.gov/ncitbrowser/");
            return vocabularyNode;
        } catch (RepositoryException e) {
            String message = "Failed to create Vocabulary node: " + e.getMessage();
            throw new VocabularyIndexException(message, e);
        }
    }

    /**
     * Creates a VocabularyTerm node representing an individual term of the NCIT. This method is protected
     * to allow subclass implementations of {@link #parseNCIT(File, Node)} to use this method, allowing the node
     * creation process to be standardized.
     * <p>
     * Note that if the label is blank, then the first listed synonym is used instead for the label; if there are no
     * synonyms either, an empty string is stored.
     * </p>
     *
     * @param vocabularyNode the parent Vocabulary node
     * @param identifier short identifier code for the term, also used as the name of the new node
     * @param label long-form name for the term
     * @param description longer definition or description of the term
     * @param synonyms synonyms for the term
     * @param parents the parent terms (direct ancestors) of the given term, as a list of identifiers
     * @param ancestors ancestor terms of the given term, as a list of identifiers
     * @throws VocabularyIndexException when node cannot be created
     */
    protected void createNCITVocabularyTermNode(Node vocabularyNode, String identifier, String label,
        String description, String[] synonyms, String[] parents, String[] ancestors)
        throws VocabularyIndexException
    {
        try {
            Node vocabularyTermNode = vocabularyNode.addNode("./" + identifier, "cards:VocabularyTerm");
            vocabularyTermNode.setProperty("identifier", identifier);

            // If the label does not exist, use the first synonym that is listed
            // In the unlikely case that there are no synonyms, use a blank String
            String defaultLabel = synonyms != null && synonyms.length > 0 ? synonyms[0] : "";
            String safeLabel = StringUtils.defaultIfBlank(label, defaultLabel);
            vocabularyTermNode.setProperty("label", safeLabel);

            vocabularyTermNode.setProperty("description", description);
            vocabularyTermNode.setProperty("synonyms", synonyms);
            vocabularyTermNode.setProperty("parents", parents);
            vocabularyTermNode.setProperty("ancestors", ancestors);
        } catch (RepositoryException e) {
            // Include the identifier (or an empty string when it is null) in the message to identify the node
            String message =
                "Failed to create VocabularyTerm node " + StringUtils.defaultString(identifier) + ": " + e.getMessage();
            throw new VocabularyIndexException(message, e);
        }
    }

    /**
     * Saves the JCR session of the homepage node that was obtained from the resource of the request. If this is
     * successful, then the changes made already will be applied to the JCR repository. If not, a
     * {@link VocabularyIndexException} wrapping the repository error is thrown.
     *
     * @param vocabulariesHomepage the VocabulariesHomepage node obtained from the request
     * @throws VocabularyIndexException if session is not successfully saved
     */
    private void saveSession(Node vocabulariesHomepage)
        throws VocabularyIndexException
    {
        try {
            vocabulariesHomepage.getSession().save();
        } catch (RepositoryException e) {
            String message = "Failed to save session: " + e.getMessage();
            throw new VocabularyIndexException(message, e);
        }
    }

    /**
     * Parses the temporary NCIT source file and creates VocabularyTerm nodes for each term. The new term
     * nodes must be children of the given Vocabulary node representing the NCIT vocabulary instance.
     *
     * @param sourceFile the source file for the vocabulary
     * @param vocabularyNode Vocabulary node representing the NCIT vocabulary where new terms will be added
     * @throws VocabularyIndexException when an error occurs with parsing
     */
    protected abstract void parseNCIT(File sourceFile, Node vocabularyNode) throws VocabularyIndexException;

    /**
     * Returns the default source from which to obtain the NCIT zip file. This is an abstract method as individual
     * subclasses will implement their own default sources.
     *
     * @param version the version of NCIT wanted, must be an available version
     * @return the location of the NCIT zip file for the given version, used when neither "localpath" nor
     *         "httppath" is provided in the request
     */
    abstract String getDefaultSource(String version);
}