Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package io.uhndata.cards.vocabularies.internal;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import javax.jcr.Node;
import javax.jcr.RepositoryException;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.osgi.service.component.annotations.Component;
import org.osgi.service.component.annotations.Reference;
import io.uhndata.cards.vocabularies.spi.VocabularyIndexException;
import io.uhndata.cards.vocabularies.spi.VocabularyIndexer;
import io.uhndata.cards.vocabularies.spi.VocabularyParserUtils;
/**
* Concrete subclass of {@link AbstractNCITIndexer} for indexing NCIT in flat file form.
*
* @version $Id: fb575b13bb65b9572c590f76b0b05710a9cdae69 $
*/
@Component(
service = VocabularyIndexer.class,
name = "VocabularyIndexer.ncit-flat",
reference = { @Reference(field = "utils", name = "utils", service = VocabularyParserUtils.class) })
public class NCITFlatIndexer extends AbstractNCITIndexer
{
// Column numbers of the properties we want to extract.
private static final int IDENTIFIER_COLUMN = 0;
private static final int PARENTS_COLUMN = 2;
private static final int SYNONYMS_COLUMN = 3;
private static final int DESCRIPTION_COLUMN = 4;
private static final int LABEL_COLUMN = 5;
// Charset to use when reading the source file
private static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8;
/** An empty String[] array to use for {@code Set.toArray}, we don't want to create a new array for each call. */
private static final String[] EMPTY_STRING_ARRAY = new String[0];
@Override
public boolean canIndex(String source)
{
return "ncit-flat".equals(source);
}
@Override
String getDefaultSource(String version)
{
return "https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Thesaurus_" + version + ".FLAT.zip";
}
@Override
protected void parseNCIT(final File source, final Node vocabularyNode) throws VocabularyIndexException
{
try {
// Extracts term parents the flat file and returns a map of (term, parents) pairs
Map parentsMap = returnParentMap(source);
// Extracts all other properties and creates VocabularyTerm nodes based on them
createTermNodes(source, parentsMap, vocabularyNode);
} catch (RepositoryException e) {
String message = "Failed to access Vocabulary node: " + e.getMessage();
throw new VocabularyIndexException(message, e);
} catch (IOException e) {
String message = "Failed to read flat file: " + e.getMessage();
throw new VocabularyIndexException(message, e);
}
}
/**
* Extracts all VocabularyTerm node properties from the NCIT flat file zip and creates JCR nodes.
* Ancestors of terms are calculated recursively from the parents.
*
* @param parentsMap a map of (term, parents) pairs where "parents" is an array of parent terms
* @param vocabularyNode the Vocabulary node which represents the current NCIT instance to index
* @throws IOException thrown when file input cannot be read
* @throws RepositoryException thrown when JCR nodes cannot be created
* @throws VocabularyIndexException throw on failure to create appropriate JCR node
*/
private void createTermNodes(final File source, Map parentsMap, Node vocabularyNode)
throws IOException, RepositoryException, VocabularyIndexException
{
// The NCIT source is an unquoted tab-delimited file
// We need withQuote(null) to keep all quotes as part of the text, and not interpreted as special chars
try (CSVParser csvParser = CSVParser.parse(source, DEFAULT_CHARSET,
CSVFormat.TDF.builder().setQuote(null).build())) {
Iterator csvIterator = csvParser.iterator();
while (csvIterator.hasNext()) {
CSVRecord row = csvIterator.next();
String identifier = row.get(IDENTIFIER_COLUMN);
String description = row.get(DESCRIPTION_COLUMN);
String synonymString = row.get(SYNONYMS_COLUMN);
// Synonym entry is a String with terms separated by "|" so split the String into a String[]
String[] synonymsArray = synonymString.split("\\|");
/*
* If this doesn't exist, the first synonym will be used. If there are no synonyms, a blank String will
* be used. This is handled in createNCITVocabularyTermNode.
*/
String label = row.get(LABEL_COLUMN);
String[] parentsArray = parentsMap.get(identifier);
/*
* Ancestors are recursively calculated from parents. Since computeAncestors returns the ancestors as a
* Set, it must be converted to a String[] here.
*/
String[] ancestorsArray = computeAncestors(parentsMap, identifier).toArray(EMPTY_STRING_ARRAY);
// This method is a protected method in AbstractNCITIndexer for creating VocabularyTerm nodes
createNCITVocabularyTermNode(vocabularyNode, identifier, label, description, synonymsArray,
parentsArray, ancestorsArray);
}
}
}
/**
* Returns a HashMap containing a (String, String[]) pair representing (term, parents). This is extracted from the
* temporary NCIT zip flat file.
*
* @return a map which stores (term ID, parent IDs) pairs
* @throws IOException thrown when temporary NCIT file cannot be read
*/
private Map returnParentMap(final File source)
throws IOException
{
// The NCIT source is an unquoted tab-delimited file
// We need withQuote(null) to keep all quotes as part of the text, and not interpreted as special chars
try (CSVParser csvParser = CSVParser.parse(source, DEFAULT_CHARSET,
CSVFormat.TDF.builder().setQuote(null).build())) {
Iterator csvIterator = csvParser.iterator();
Map parents = new HashMap<>();
while (csvIterator.hasNext()) {
CSVRecord row = csvIterator.next();
String identifier = row.get(IDENTIFIER_COLUMN);
String parentString = row.get(PARENTS_COLUMN);
// Parent entry is a String with terms separated by "|" so split the String into a String[]
String[] parentArray = parentString.split("\\|");
// Put a blank array if there are no parents
if (parentArray[0].contentEquals("")) {
parents.put(identifier, EMPTY_STRING_ARRAY);
} else {
parents.put(identifier, parentArray);
}
}
return parents;
}
}
/**
* Recursively calculates the ancestors of a given term, using the map of parents to accumulate a list of ancestors.
* This is returned as a String set.
*
* @param parentsMap a map of (term, parents) pairs where "parents" is an array of parent terms
* @param identifier identifier of the term
* @return a set which stores the ancestors of the given term
*/
private Set computeAncestors(Map parentsMap, String identifier)
{
// A set is used instead of an array so that the size need not be initially specified.
Set termAncestorSet = new HashSet<>();
String[] parentArray = parentsMap.get(identifier);
if (parentArray != null) {
for (String parentIdentifier : parentArray) {
// Add parent as an ancestor
termAncestorSet.add(parentIdentifier);
// Add recursively calculated ancestors of parent
termAncestorSet.addAll(computeAncestors(parentsMap, parentIdentifier));
}
}
return termAncestorSet;
}
}