it.uniroma2.art.semanticturkey.extension.impl.rdflifter.spreadsheetdeserializer.SpreadsheetDeserializingLifter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of st-spreadsheet-deserializing-lifter Show documentation
The newest version!
package it.uniroma2.art.semanticturkey.extension.impl.rdflifter.spreadsheetdeserializer;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.*;
import org.eclipse.rdf4j.model.*;
import org.eclipse.rdf4j.model.impl.LinkedHashModel;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.SKOS;
import org.eclipse.rdf4j.model.vocabulary.SKOSXL;
import org.eclipse.rdf4j.rio.RDFHandler;

import it.uniroma2.art.semanticturkey.extension.extpts.rdflifter.LifterContext;
import it.uniroma2.art.semanticturkey.extension.extpts.rdflifter.LiftingException;
import it.uniroma2.art.semanticturkey.extension.extpts.rdflifter.RDFLifter;
import it.uniroma2.art.semanticturkey.extension.extpts.reformattingexporter.ClosableFormattedResource;
import org.eclipse.rdf4j.rio.helpers.NTriplesUtil;

/**
 * An {@link RDFLifter} that deserializes RDF data from a spreadsheet created by the provided exporter in ST
 * 
 * @author Andrea Turbati
 */
public class SpreadsheetDeserializingLifter implements RDFLifter {

	// maps a prefix to its namespace; it is looked up when a qname has to be expanded into a full IRI
	private final Map<String, String> prefixToNamespaceMap = new HashMap<>();

	// used to keep track of the property name for a given column number (and whether such property should be
	// used in a reified construct, meaning that the subject of such property is not the main element of the row
	// but the element of the previous cell)
	private final Map<Integer, PropertyName> posToPropertyNameMap = new HashMap<>();

	/**
	 * Reads a spreadsheet workbook from {@code sourceFormattedResource}, converts its content into RDF
	 * statements and forwards each of them to {@code targetRDFHandler}.
	 * <p>
	 * The second sheet is handed to {@code readPrefixed} (prefix mapping), then the header row of the first
	 * sheet is scanned, starting from the second column, for the four "::" separator cells that delimit the
	 * parts of each row:
	 * <ul>
	 * <li>concept hierarchy / scheme URI / collection hierarchy</li>
	 * <li>lexicalizations</li>
	 * <li>types</li>
	 * <li>reified notes (this part can be empty)</li>
	 * <li>generic properties</li>
	 * </ul>
	 * Finally the concept, scheme and collection sections of the first sheet are processed in this order.
	 *
	 * @param sourceFormattedResource
	 *            the resource providing the input stream of the spreadsheet
	 * @param format
	 *            not used by this implementation
	 * @param targetRDFHandler
	 *            the handler receiving every extracted statement
	 * @param lifterContext
	 *            not used by this implementation
	 * @throws LiftingException
	 *             if the first sheet has no header row, or the header row ends before all the four "::"
	 *             separators have been found
	 * @throws IOException
	 *             if the spreadsheet cannot be read or the workbook cannot be closed
	 */
	@Override
	public void lift(ClosableFormattedResource sourceFormattedResource, String format,
			RDFHandler targetRDFHandler, LifterContext lifterContext) throws LiftingException, IOException {

		// this model will contain all the triples extracted from the spreadsheet
		Model model = new LinkedHashModel();

		// the workbook holds resources of its own, so it is closed (together with the stream) as soon as
		// the sheets have been processed
		try (InputStream is = sourceFormattedResource.getInputStream();
				Workbook workbook = WorkbookFactory.create(is)) {

			// read the second sheet, containing the prefix-mapping
			readPrefixed(workbook.getSheetAt(1));

			// now focus on the first sheet
			Sheet sheet = workbook.getSheetAt(0);

			// start with the headers
			// all these "start_*" values represent the colPos where the "::" are, so a pos before the actual
			// starting of the part
			int start_col_lexicalization = 0, start_col_type = 0, start_col_reifiedNote = 0,
					start_pos_genProp = 0;
			int afterLastCol;
			int posCol = 0;
			Row row = sheet.getRow(0);
			if (row == null) {
				throw new LiftingException(
						new IllegalStateException("The first sheet does not contain the header row"));
			}
			// index of the last header cell plus one (-1 when the row has no cells): from this column on
			// every cell is null, so nothing else can be found
			int lastCellNum = row.getLastCellNum();
			while (true) {
				Cell cell = row.getCell(++posCol);
				if ((cell == null || cell.getStringCellValue().isEmpty()) && start_pos_genProp != 0) {
					// first empty header after the generic properties part has started: the header is over
					afterLastCol = posCol;
					break;
				}
				if (cell == null) {
					if (posCol >= lastCellNum) {
						// without this check the scan would go on forever over missing cells
						throw new LiftingException(new IllegalStateException(
								"The header row does not contain the four \"::\" separators"));
					}
					continue;
				}
				String cellValue = cell.getStringCellValue().trim();
				if (cellValue.equals("::")) {
					// found a separator
					if (start_col_lexicalization == 0) {
						start_col_lexicalization = posCol;
					} else if (start_col_type == 0) {
						start_col_type = posCol;
					} else if (start_col_reifiedNote == 0) {
						start_col_reifiedNote = posCol;
					} else {
						start_pos_genProp = posCol;
					}
				} else if (start_col_lexicalization != 0) {
					// the concept hierarchy has been passed, so now the headers are about the property names
					// (and reified used properties)
					boolean fromReified;
					if (start_pos_genProp != 0) {
						// generic property part, so no reified values
						fromReified = false;
					} else if (start_col_reifiedNote != 0) {
						// reified note part: an rdf:value column is reified for the previous column
						fromReified = toIRI(cellValue).equals(RDF.VALUE);
					} else if (start_col_type != 0) {
						// type part, no reified properties
						fromReified = false;
					} else {
						// lexicalization part: a skosxl:literalForm column is reified
						fromReified = toIRI(cellValue).equals(SKOSXL.LITERAL_FORM);
					}
					posToPropertyNameMap.put(posCol, new PropertyName(cellValue, fromReified));
				}
			}

			int posRow = 0;
			// Concepts part (CONCEPTS HIERARCHY)
			posRow = processConceptSection(sheet, posRow, start_col_lexicalization, start_col_type,
					start_col_reifiedNote, start_pos_genProp, afterLastCol, model);

			// Schemes part (CONCEPT SCHEMES)
			posRow = processSchemeSection(sheet, ++posRow, start_col_lexicalization, start_col_type,
					start_col_reifiedNote, start_pos_genProp, afterLastCol, model);

			// Collection part (COLLECTIONS HIERARCHY)
			processCollectionSection(sheet, ++posRow, start_col_lexicalization, start_col_type,
					start_col_reifiedNote, start_pos_genProp, afterLastCol, model);
		}

		for (Statement stmt : model) {
			targetRDFHandler.handleStatement(stmt);
		}

		// no InvalidFormatException handling is needed: it is not thrown anymore in
		// org.apache.poi:poi-ooxml 4.0.1 (it was thrown in org.apache.poi:poi-ooxml 3.10-FINAL)
	}


	/**
	 * Processes the rows of the concept section of {@code sheet}.
	 * <p>
	 * In the visible part of this method the row index is incremented before each read, so the row at the
	 * incoming {@code posRow} is not read, and the iteration stops at the first row that does not exist.
	 * <p>
	 * NOTE(review): the text of this method appears truncated inside its inner {@code for} header; what
	 * follows that point looks like the tail of the ordered-collection handling of another method. The
	 * missing code has to be restored from the upstream source before this block can be trusted.
	 */
	private int processConceptSection(Sheet sheet, int posRow, int start_col_lexicalization, int start_col_type,
									  int start_col_reifiedNote, int start_pos_genProp, int afterLastCol, Model model) {
		Row row;
		// iterate over each row: the first not empty/null cell found is the URI (or qname) of the concept
		while(true){
			//iterate over each row
			String concept = null;
			boolean foundData = false;
			// move to the next row (pre-increment: the row at the incoming posRow is skipped)
			row = sheet.getRow(++posRow);

			// analyze each cell of this row (by using the divisions of the various parts)

			// search for the concept IRI/qname
			if(row == null){
				// it is an empty row, so this section is terminated
				break;
			}
			// NOTE(review): the next line looks truncated -- the loop header breaks off after "posCol" and
			// continues with a declaration that seems to belong to the ordered-collection handling
			for(int posCol=0; posCol memberOfOrdCollList = new ArrayList<>();
				int posRowMember = posRow;
				int posColMember = posCol+1;
				// a list is present, so take all the elements under it (members are expected one column to
				// the right of the collection cell)
				while(true){
					Row rowMember = sheet.getRow(++posRowMember);
					if(rowMember==null){
						// no more rows to consider
						break;
					}
					Cell cellSameCol = rowMember.getCell(posCol);
					if(cellSameCol!=null && !cellSameCol.getStringCellValue().isEmpty()){
						// this row contains a resource that is at the same level of the ordered collection
						break;
					}
					Cell cellMember = rowMember.getCell(posColMember);
					if(cellMember==null || cellMember.getStringCellValue().isEmpty()) {
						// this row does not contain a member of this ordered collection
						break;
					}
					// the cellMember contains a resource that belongs to the ordered collection
					memberOfOrdCollList.add((Resource) createValue(cellMember.getStringCellValue()));
				}
				// now create the RDF list containing all the resources
				// (collection and listRes are declared in the part of the code that is not visible here)
				model.add((Resource) createValue(collection), SKOS.MEMBER_LIST, listRes);

				// the first member is attached to listRes itself; every following member gets a fresh blank
				// node that is linked to the previous one through rdf:rest
				Resource prevList = listRes;
				Resource currentList = prevList;
				for(Resource first : memberOfOrdCollList){
					if(currentList == null){
						currentList = SimpleValueFactory.getInstance().createBNode();
						model.add(prevList,RDF.REST, currentList);
					}
					model.add(currentList, RDF.TYPE, RDF.LIST);
					model.add(currentList, RDF.FIRST, first);
					// update the currentList and prevList
					prevList = currentList;
					currentList = null;
				}
				// close the list with rdf:nil
				// NOTE(review): when no member has been collected, listRes only gets "rdf:rest rdf:nil"
				// (no rdf:type, no rdf:first) -- confirm that this is intended
				model.add(prevList, RDF.REST, RDF.NIL);
			}
		}
		return posRow;
	}

	/********************************************************/
	/**
	 * Handles the lexicalization columns of {@code row}, i.e. the ones delimited by
	 * {@code start_col_lexicalization} and {@code start_col_type}, by delegating to
	 * {@code processPossibleReifiedCol} with {@link SKOSXL#LABEL} as last argument.
	 */
	private void processLexicalizationCol(Row row, int start_col_lexicalization, int start_col_type,
										  Resource resource, Model model) {
		processPossibleReifiedCol(row, start_col_lexicalization, start_col_type, resource, model, SKOSXL.LABEL);
	}

	/**
	 * Handles the type columns of {@code row}, i.e. the ones delimited by {@code start_col_type} and
	 * {@code start_col_reifiedNote}, by delegating to {@code processNotReifiedCol}.
	 */
	private void processTypeCol(Row row, int start_col_type, int start_col_reifiedNote, Resource resource, Model model){
		processNotReifiedCol(row, start_col_type, start_col_reifiedNote, resource, model);
	}

	/**
	 * Handles the reified note columns of {@code row}, i.e. the ones delimited by
	 * {@code start_col_reifiedNote} and {@code start_pos_genProp}, by delegating to
	 * {@code processPossibleReifiedCol} with {@code null} as last argument.
	 */
	private void processReiNoteCol(Row row, int start_col_reifiedNote, int start_pos_genProp, Resource resource, Model model){
		processPossibleReifiedCol(row, start_col_reifiedNote, start_pos_genProp, resource, model, null);
	}

	/**
	 * Handles the generic property columns of {@code row}, i.e. the ones delimited by
	 * {@code start_pos_genProp} and {@code afterLastCol}, by delegating to {@code processNotReifiedCol}.
	 */
	private void processGenPropCol(Row row, int start_pos_genProp, int afterLastCol, Resource resource, Model model){
		processNotReifiedCol(row, start_pos_genProp, afterLastCol, resource, model);
	}

	/**
	 * Processes the columns of {@code row} that follow the {@code start} column (the loop starts from
	 * {@code start+1}).
	 * <p>
	 * NOTE(review): the text of this method appears truncated inside its {@code for} header; what follows
	 * that point looks like the tail of a different method that turns a qname or an IRI string into an
	 * {@code IRI}. The missing code has to be restored from the upstream source.
	 */
	private void processNotReifiedCol(Row row, int start, int end, Resource resource, Model model){
		// NOTE(review): the next line is truncated -- the loop header breaks off right after "i"
		for(int i=start+1; i"));
			return SimpleValueFactory.getInstance().createIRI(iriString);
		}
		// the qnameOrIri is a qname, so extract the prefix and local name
		// NOTE(review): only the text between the first and the second ':' is taken as local name, so
		// anything after a second ':' is dropped
		String prefix = qnameOrIri.split(":")[0].trim();
		String localname = qnameOrIri.split(":")[1].trim();
		// NOTE(review): when the prefix is not in the map, namespace is null and the concatenation below
		// produces the text "null" followed by the local name
		String namespace = prefixToNamespaceMap.get(prefix);
		return SimpleValueFactory.getInstance().createIRI(namespace+localname);
	}

	/**
	 * Converts the textual content of a cell into an RDF {@link Value}.
	 * <p>
	 * A qname (see {@code isQName}) is expanded through {@code createIRI}. Any other text is parsed as an
	 * N-Triples value; before parsing, a typed literal whose datatype is written as a qname (e.g.
	 * {@code "5"^^xsd:int}) gets the qname replaced by the corresponding IRI in N-Triples form.
	 *
	 * @param value
	 *            the text to convert, never {@code null}
	 * @return the value represented by {@code value}
	 */
	private Value createValue(String value) {
		// manage the qname case
		if (isQName(value)) {
			return createIRI(value);
		}
		// be careful of the case of literal with a datatype being a qname
		if (value.startsWith("\"")) {
			// the datatype is introduced by the closing quote of the label followed by "^^": looking for a
			// lone '^' would also match a caret inside the label, and cutting the text at the last '^' would
			// leave a single caret in front of the datatype IRI
			final String datatypeMarker = "\"^^";
			int markerPos = value.lastIndexOf(datatypeMarker);
			if (markerPos > 0) { // position 0 is the opening quote, not a closing one
				int datatypeStart = markerPos + datatypeMarker.length();
				String datatype = value.substring(datatypeStart);
				// a quote after the marker means that the marker was found inside the label
				if (datatype.indexOf('"') < 0 && isQName(datatype)) {
					value = value.substring(0, datatypeStart)
							+ NTriplesUtil.toNTriplesString(createIRI(datatype));
				}
			}
		}
		return NTriplesUtil.parseValue(value, SimpleValueFactory.getInstance());
	}

	/**
	 * Tells whether the given text has to be treated as a qname.
	 *
	 * @param qname
	 *            the text to check
	 * @return {@code true} if the text contains a ':' and does not start with {@code <}, {@code "} or
	 *         {@code _:}, {@code false} otherwise
	 */
	private boolean isQName(String qname){
		// texts starting with '<', '"' or "_:" are never qnames
		if (qname.startsWith("<") || qname.startsWith("\"") || qname.startsWith("_:")) {
			return false;
		}
		// anything else is a qname as long as it has a prefix separator
		return qname.contains(":");
	}



	/**
	 * The property associated with a column header, together with the flag telling whether the column is
	 * part of a reified construct.
	 */
	public class PropertyName {
		private IRI propertyIri;
		boolean isFromReified;

		/**
		 * @param propertyQNameOrIRI
		 *            the header text; everything from its last '@' on (the language tag) is ignored
		 * @param isFromReified
		 *            the value returned by {@link #isFromReified()}
		 */
		public PropertyName(String propertyQNameOrIRI, boolean isFromReified) {
			// the header can end with @LANG_TAG: keep only what comes before the last '@'
			int langTagStart = propertyQNameOrIRI.lastIndexOf('@');
			String bareName = langTagStart < 0
					? propertyQNameOrIRI
					: propertyQNameOrIRI.substring(0, langTagStart);
			this.propertyIri = createIRI(bareName);
			this.isFromReified = isFromReified;
		}

		/**
		 * @return the IRI built from the header text
		 */
		public IRI getPropertyIri() {
			return this.propertyIri;
		}

		/**
		 * @return the flag passed to the constructor
		 */
		public boolean isFromReified() {
			return this.isFromReified;
		}
	}
}