# All downloads are free. Search and download functionality uses the official Maven repository.
#
# home.harvest.workflows.dataset-rules.py Maven / Gradle / Ivy
#
# There is a newer version: 1.10.2
# Show newest version
import time
from com.googlecode.fascinator.api.storage import StorageException
from com.googlecode.fascinator.common import JsonSimple
from com.googlecode.fascinator.common.storage import StorageUtils
from java.text import SimpleDateFormat
from java.util import HashSet, HashMap, Date
from org.apache.commons.io import IOUtils
from org.joda.time import DateTime

class IndexData:
    def __activate__(self, context):
        """Entry point: index the payload described by ``context``.

        Copies the collaborators out of ``context`` onto ``self``, starts the
        index document, runs the metadata steps when the payload is the
        object's metadata payload, and finally applies security.
        """
        # Unpack the collaborators supplied by the caller
        self.index = context["fields"]
        self.object = context["object"]
        self.payload = context["payload"]
        self.params = context["params"]
        self.utils = context["pyUtils"]
        self.config = context["jsonConfig"]
        self.log = context["log"]

        # UTC timestamp taken once at activation
        utcNow = time.gmtime()
        self.last_modified = time.strftime("%Y-%m-%dT%H:%M:%SZ", utcNow)
        objectId = self.object.getId()
        payloadId = self.payload.getId()
        self.log.debug("Indexing Metadata Record '{}' '{}'", objectId, payloadId)

        # Fields common to every payload
        self.__newDoc()

        # Remember the package payload; if several ids match, the last one wins
        self.packagePid = None
        for candidate in self.object.getPayloadIdList():
            if candidate.endswith(".tfpackage"):
                self.packagePid = candidate

        # Only the metadata payload (item type "object") gets real metadata
        if self.itemType == "object":
            self.__basicData()
            self.__metadata()
            # The steps above may have requested messages, particularly
            # the workflow step
            self.__messages()

        # Security must come after workflows, which set self.item_security
        self.__security()

    def __newDoc(self):
        """Start the index document with the fields every payload receives.

        Sets ``self.oid``, ``self.pid``, ``self.itemType``,
        ``self.item_security`` and ``self.owner``, and writes the ids, harvest
        references and the created / modified dates to the index. Also stores
        ``last_modified`` and ``date_object_modified`` back into the params.
        """
        self.oid = self.object.getId()
        self.pid = self.payload.getId()
        isMetadataPayload = (self.pid == self.params.getProperty("metaPid", "DC"))

        self.utils.add(self.index, "storage_id", self.oid)
        if not isMetadataPayload:
            # Any other payload is indexed as a datastream under "<oid>/<pid>"
            self.oid += "/" + self.pid
            self.itemType = "datastream"
            self.utils.add(self.index, "identifier", self.pid)
        else:
            self.itemType = "object"

        for fieldName, fieldValue in (
                ("id", self.oid),
                ("item_type", self.itemType),
                ("last_modified", self.last_modified),
                ("harvest_config", self.params.getProperty("jsonConfigOid")),
                ("harvest_rules", self.params.getProperty("rulesOid"))):
            self.utils.add(self.index, fieldName, fieldValue)

        self.item_security = []
        self.owner = self.params.getProperty("owner", "guest")

        ## We have incorrect timezones ending in 'Z' and datetimes with a timezone
        ## indicator (using +/-, which breaks solr queries).
        # Strip the current timezone/UTC designator from the datetime and append
        # 'Z' as a workaround for solr queries using now()-10
        # TODO: fix this hack in next release
        createdRaw = self.params.getProperty("date_object_created")
        self.log.debug("Current date object created is: %s" % createdRaw)
        noZonePattern = "yyyy-MM-dd'T'HH':'mm':'ss"
        if not str(createdRaw).endswith("Z"):
            self.log.debug("Attempting to parse existing solr date created, presuming an ISO timezone format..")
            # Too many formats to predict; joda's DateTime ingests various ISO
            # formats, assuming the timezone has been constant
            parsedCreated = DateTime(createdRaw)
            self.log.debug("parsed 'date_object_created' is: %s" % parsedCreated.toString())
            zonelessCreated = parsedCreated.toString(noZonePattern)
        else:
            self.log.debug("Attempting to parse existing solr date created with UTC format..")
            # NOTE(review): the 'Z' pattern letter of SimpleDateFormat denotes an
            # RFC 822 zone - confirm values ending in a literal 'Z' parse as intended
            parsedCreated = SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ").parse(createdRaw)
            self.log.debug("parsed 'date_object_created' is: %s" % parsedCreated)
            zonelessCreated = SimpleDateFormat(noZonePattern).format(parsedCreated)
        self.log.debug("timezone-stripped date time is: %s" % zonelessCreated)

        createdStamp = zonelessCreated + "Z"
        self.log.debug("'date_object_created' will be: %s" % createdStamp)
        self.utils.add(self.index, "date_object_created", createdStamp)

        # Compact "now" stamp kept in the params
        compactNow = SimpleDateFormat('yyyyMMddHHmmss').format(Date())
        self.params.setProperty("last_modified", compactNow)
        self.log.debug("'last_modified' will be: %s" % str(compactNow))

        # NOTE(review): local time is formatted with a literal 'Z' suffix here,
        # whereas __activate__ builds self.last_modified from gmtime()
        modifiedStamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime())
        self.log.debug("'date_object_modified' will be: %s" % str(modifiedStamp))
        self.params.setProperty("date_object_modified", modifiedStamp)
        self.utils.add(self.index, "date_object_modified",  self.params.getProperty("date_object_modified"))

    def __basicData(self):
        """Index repository details, persistent ids and the published flag."""
        params = self.params
        self.utils.add(self.index, "repository_name", params["repository.name"])
        self.utils.add(self.index, "repository_type", params["repository.type"])

        dateTransitioned = params["date_transitioned"]
        if dateTransitioned is not None:
            self.utils.add(self.index, "date_transitioned", dateTransitioned)

        # VITAL integration
        vitalPid = params["vitalPid"]
        if vitalPid is not None:
            self.utils.add(self.index, "vitalPid", vitalPid)

        # Persistent identifiers: the config names the param that holds the id
        pidProperty = self.config.getString(None, ["curation", "pidProperty"])
        if pidProperty is not None:
            persistentId = params[pidProperty]
            if persistentId is not None:
                # The same id is indexed under three field names
                for fieldName in ("known_ids", "pidProperty", "oai_identifier"):
                    self.utils.add(self.index, fieldName, persistentId)
        else:
            self.log.error("No configuration found for persistent IDs!")
        self.utils.add(self.index, "oai_set", "default")

        # Publication: any non-None value is indexed as "true"
        if params["published"] is not None:
            self.utils.add(self.index, "published", "true")

    def __security(self):
        """Reconcile stored access with ``self.item_security`` and index it.

        Roles returned by ``utils.getRolesWithAccess`` are kept (and indexed as
        ``security_filter``) when they appear in ``self.item_security`` and
        revoked otherwise; roles in ``self.item_security`` that are not yet
        stored are granted and indexed. Users returned by
        ``utils.getUsersWithAccess`` are indexed as ``security_exception``, and
        the owner (or "system" when there is none) as ``owner``.
        """
        # Security
        roles = self.utils.getRolesWithAccess(self.oid)
        if roles is not None:
            # For every role currently with access
            for role in roles:
                # Empty roles should not show up, but during debugging we got a few
                if role != "":
                    if role in self.item_security:
                        # They still have access
                        self.utils.add(self.index, "security_filter", role)
                    else:
                        # Their access has been revoked
                        self.__revokeRoleAccess(role)
            # Now for every role that the new step allows access
            for role in self.item_security:
                if role not in roles:
                    # Grant access if new
                    self.__grantRoleAccess(role)
                    self.utils.add(self.index, "security_filter", role)

        # No existing security
        else:
            if self.item_security is None:
                # Guest access if none provided so far. No loop has run on this
                # path, so the role name must be given literally rather than
                # read from a loop variable.
                self.__grantRoleAccess("guest")
                self.utils.add(self.index, "security_filter", "guest")
            else:
                # Otherwise use workflow security
                for role in self.item_security:
                    # Grant access if new
                    self.__grantRoleAccess(role)
                    self.utils.add(self.index, "security_filter", role)

        users = self.utils.getUsersWithAccess(self.oid)
        if users is not None:
            # For every user currently with access
            for user in users:
                self.utils.add(self.index, "security_exception", user)

        # Ownership
        if self.owner is None:
            self.utils.add(self.index, "owner", "system")
        else:
            self.utils.add(self.index, "owner", self.owner)

    def __indexList(self, name, values):
        """Add each distinct entry of ``values`` to the index field ``name``."""
        # A HashSet drops duplicate values before they are indexed
        distinctValues = HashSet(values)
        for entry in distinctValues:
            self.utils.add(self.index, name, entry)

    def __grantRoleAccess(self, newRole):
        """Pass a schema for ``self.oid`` with role ``newRole`` to ``utils.setAccessSchema``."""
        roleSchema = self.utils.getAccessSchema()
        roleSchema.setRecordId(self.oid)
        roleSchema.set("role", newRole)
        self.utils.setAccessSchema(roleSchema)

    def __grantUserAccess(self, newUser):
        """Pass a schema for ``self.oid`` with user ``newUser`` to ``utils.setAccessSchema``."""
        userSchema = self.utils.getAccessSchema()
        userSchema.setRecordId(self.oid)
        userSchema.set("user", newUser)
        self.utils.setAccessSchema(userSchema)

    def __revokeRoleAccess(self, oldRole):
        """Pass a schema for ``self.oid`` with role ``oldRole`` to ``utils.removeAccessSchema``."""
        roleSchema = self.utils.getAccessSchema()
        roleSchema.setRecordId(self.oid)
        roleSchema.set("role", oldRole)
        self.utils.removeAccessSchema(roleSchema)

    def __revokeUserAccess(self, oldUser):
        """Pass a schema for ``self.oid`` with user ``oldUser`` to ``utils.removeAccessSchema``."""
        userSchema = self.utils.getAccessSchema()
        userSchema.setRecordId(self.oid)
        userSchema.set("user", oldUser)
        self.utils.removeAccessSchema(userSchema)

    def __metadata(self):
        """Reset the metadata accumulators, run the workflow step and index the result.

        ``__workflow`` fills the accumulators initialised here; this method then
        applies defaults and writes the collected values to the index.
        """
        # Scalars filled in by the workflow step
        self.title = None
        self.dcType = None
        self.embargoedDate = None
        self.createTimeStamp = None

        # Multi-valued accumulators
        self.descriptionList = []
        self.creatorList = []
        self.creationDate = []
        self.contributorList = []
        self.approverList = []
        self.formatList = ["application/x-fascinator-package"]
        self.fulltext = []
        self.grantNumberList = []
        self.relationDict = {}
        self.customFields = {}
        self.creatorFullNameMap = HashMap()

        # Reporting fields: array-style values plus fields composed of two parts
        self.arrayBucket = HashMap()
        self.reportingFieldPrefix = "reporting_"
        self.compFields = ["dc:coverage.vivo:DateTimeInterval", "locrel:prc.foaf:Person"]
        self.compFieldsConfig = {"dc:coverage.vivo:DateTimeInterval":{"delim":" to ","start":"start","end":"end"},"locrel:prc.foaf:Person":{"delim":", ","start":"familyName","end":"givenName"} }

        # Try our data sources, order matters
        self.__workflow()

        # Some defaults if the above failed
        if self.title is None:
            self.title = "New Dataset"
        if len(self.formatList) == 0:
            packagePayload = self.object.getPayload(self.packagePid)
            self.formatList.append(packagePayload.getContentType())

        # Index our metadata finally
        self.utils.add(self.index, "dc_title", self.title)
        if self.dcType is not None:
            self.utils.add(self.index, "dc_type", self.dcType)
        for fieldName, fieldValues in (
                ("dc_creator", self.creatorList),  # no dc_author in schema.xml, need to check
                ("dc_contributor", self.contributorList),
                ("dc_description", self.descriptionList),
                ("dc_format", self.formatList),
                ("dc_date", self.creationDate),
                ("full_text", self.fulltext)):
            self.__indexList(fieldName, fieldValues)
        for key, fieldValues in self.customFields.items():
            self.__indexList(key, fieldValues)
        for key, fieldValues in self.relationDict.items():
            self.__indexList(key, fieldValues)

        # Person and composite buckets hold a map, so index its values;
        # every other bucket is indexed as it is
        for bucketName in self.arrayBucket.keySet():
            bucket = self.arrayBucket.get(bucketName)
            isMapBucket = (bucketName.endswith("Person")
                           or bucketName.replace(self.reportingFieldPrefix, "") in self.compFields)
            if isMapBucket:
                bucket = bucket.values()
            self.__indexList(bucketName, bucket)

        if self.embargoedDate is not None:
            self.utils.add(self.index, "date_embargoed", self.embargoedDate+"T00:00:00Z")
        # Stamp a creation time only when no "create_timestamp" value was found
        if self.createTimeStamp is None:
            nowStamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.localtime())
            self.utils.add(self.index, "create_timestamp", nowStamp)
    def __workflow(self):
        """Load or create the workflow metadata and index the package manifest.

        Reads the "workflow.metadata" payload (creating it at the initial stage
        when storage has none), applies a pending step change, and writes the
        payload back when it changed. The matching stage supplies
        ``self.item_security``, the workflow security groups and, on a change,
        ``self.message_list``. The manifest stored under ``self.packagePid`` is
        then indexed field by field, followed by the display type, a fallback
        creation date and the workflow fields.
        """
        # Workflow data
        WORKFLOW_ID = "dataset"
        wfChanged = False
        workflow_security = []
        self.message_list = None
        stages = self.config.getJsonSimpleList(["stages"])
        # Records owned by "guest" are shown as submission requests and start
        # at stage index 0; all others start at stage index 1
        if self.owner == "guest":
            pageTitle = "Submission Request"
            displayType = "submission-request"
            initialStep = 0
        else:
            pageTitle = "Metadata Record"
            displayType = "package-dataset"
            initialStep = 1
        try:
            wfMeta = self.__getJsonPayload("workflow.metadata")
            wfMeta.getJsonObject().put("pageTitle", pageTitle)

            # Are we indexing because of a workflow progression?
            targetStep = wfMeta.getString(None, ["targetStep"])
            if targetStep is not None and targetStep != wfMeta.getString(None, ["step"]):
                wfChanged = True
                # Step change
                wfMeta.getJsonObject().put("step", targetStep)
                wfMeta.getJsonObject().remove("targetStep")
            # This must be a re-index then
            else:
                targetStep = wfMeta.getString(None, ["step"])

            # Security change
            for stage in stages:
                if stage.getString(None, ["name"]) == targetStep:
                    wfMeta.getJsonObject().put("label", stage.getString(None, ["label"]))
                    self.item_security = stage.getStringList(["visibility"])
                    workflow_security = stage.getStringList(["security"])
                    # Messages are only sent when the step actually changed
                    if wfChanged:
                        self.message_list = stage.getStringList(["message"])
        except StorageException:
            # No workflow payload, time to create
            initialStage = stages.get(initialStep).getString(None, ["name"])
            wfChanged = True
            wfMeta = JsonSimple()
            wfMetaObj = wfMeta.getJsonObject()
            wfMetaObj.put("id", WORKFLOW_ID)
            wfMetaObj.put("step", initialStage)
            wfMetaObj.put("pageTitle", pageTitle)
            stages = self.config.getJsonSimpleList(["stages"])
            for stage in stages:
                if stage.getString(None, ["name"]) == initialStage:
                    wfMetaObj.put("label", stage.getString(None, ["label"]))
                    self.item_security = stage.getStringList(["visibility"])
                    workflow_security = stage.getStringList(["security"])
                    self.message_list = stage.getStringList(["message"])

        # Has the workflow metadata changed?
        if wfChanged:
            inStream = IOUtils.toInputStream(wfMeta.toString(True), "UTF-8")
            try:
                StorageUtils.createOrUpdatePayload(self.object, "workflow.metadata", inStream)
            except StorageException:
                # Best effort: indexing continues, but the failure goes to the
                # log like every other message in this script
                self.log.error("ERROR updating dataset payload for object '%s'" % self.oid)

        # Form processing
        coreFields = ["title", "description", "manifest", "metaList", "relationships", "responses"]
        formData = wfMeta.getObject(["formData"])
        if formData is not None:
            formData = JsonSimple(formData)
            # Core fields
            description = formData.getStringList(["description"])
            if description:
                self.descriptionList = description
            # Non-core fields
            data = formData.getJsonObject()
            for field in data.keySet():
                if field not in coreFields:
                    self.customFields[field] = formData.getStringList([field])

        # Manifest processing (formData not present in wfMeta)
        manifest = self.__getJsonPayload(self.packagePid)
        formTitles = manifest.getStringList(["title"])
        if formTitles:
            for formTitle in formTitles:
                if self.title is None:
                    self.title = formTitle
        # The manifest description replaces any description taken from formData
        self.descriptionList = [manifest.getString("", ["description"])]

        #Used to make sure we have a created date
        createdDateFlag  = False

        formData = manifest.getJsonObject()

        for field in formData.keySet():
            if field not in coreFields:
                value = formData.get(field)
                if value is not None and value.strip() != "":
                    self.utils.add(self.index, field, value)
                    # We want to sort by date of creation, so it
                    # needs to be indexed as a date (ie. 'date_*')
                    if field == "dc:created":
                        parsedTime = time.strptime(value, "%Y-%m-%d")
                        solrTime = time.strftime("%Y-%m-%dT%H:%M:%SZ", parsedTime)
                        self.utils.add(self.index, "date_created", solrTime)
                        self.log.debug("Set created date to :%s" % solrTime)
                        createdDateFlag = True
                    elif field == "redbox:embargo.dc:date":
                        self.embargoedDate = value
                    elif field == "create_timestamp":
                        self.createTimeStamp = value
                    # try to extract some common fields for faceting
                    if field.startswith("dc:") and \
                            not (field.endswith(".dc:identifier.rdf:PlainLiteral") \
                              or field.endswith(".dc:identifier") \
                              or field.endswith(".rdf:resource")):
                        # index dublin core fields for faceting
                        basicField = field.replace("dc:", "dc_")
                        # "dc:" and "dc_" have the same length, so an offset
                        # found in field is valid in basicField as well
                        dot = field.find(".")
                        if dot > 0:
                            facetField = basicField[:dot]
                        else:
                            facetField = basicField
                        #print "Indexing DC field '%s':'%s'" % (field, facetField)
                        if facetField == "dc_title":
                            if self.title is None:
                                self.title = value
                        elif facetField == "dc_type":
                            if self.dcType is None:
                                self.dcType = value
                        elif facetField == "dc_creator":
                            # NOTE(review): only "dc:" was rewritten to "dc_"
                            # above, so a field ending in "foaf:name" does not
                            # match this suffix - confirm the intended test
                            if basicField.endswith("foaf_name"):
                                self.utils.add(self.index, "dc_creator", value)
                        else:
                            self.utils.add(self.index, facetField, value)
                        # index keywords for lookup
                        if field.startswith("dc:subject.vivo:keyword."):
                            self.utils.add(self.index, "keywords", value)
                    # check if this is an array field
                    fnameparts = field.split(":")
                    if fnameparts is not None and len(fnameparts) >= 3:
                        if field.startswith("bibo") or field.startswith("skos"):
                            arrParts = fnameparts[1].split(".")
                        else:
                            arrParts = fnameparts[2].split(".")
                        # we're not interested in: Relationship, Type and some redbox:origin
                        if arrParts is not None and len(arrParts) >= 2 and field.find(":Relationship.") == -1 and field.find("dc:type") == -1 and field.find("redbox:origin") == -1 and arrParts[1].isdigit():
                            # we've got an array field: drop the numeric index
                            # segment to get one reporting field per array
                            fldPart = ":%s" % arrParts[0]
                            prefixEndIdx = field.find(fldPart) + len(fldPart)
                            suffixStartIdx = prefixEndIdx+len(arrParts[1])+1
                            arrFldName = self.reportingFieldPrefix + field[:prefixEndIdx] + field[suffixStartIdx:]
                            if field.endswith("Name"):
                                arrFldName = self.reportingFieldPrefix + field[:prefixEndIdx]
                            self.log.debug("Array Field name is:%s  from: %s, with value:%s" % (arrFldName, field, value))

                            if field.endswith("Name"):
                                # Name parts are merged per array index into
                                # "familyName, givenName", whichever arrives first
                                fullFieldMap = self.arrayBucket.get(arrFldName)
                                if fullFieldMap is None:
                                    fullFieldMap = HashMap()
                                    self.arrayBucket.put(arrFldName, fullFieldMap)
                                idx = arrParts[1]
                                fullField = fullFieldMap.get(idx)
                                if (fullField is None):
                                    fullField = ""
                                if (field.endswith("givenName")):
                                    fullField = "%s, %s" % (fullField, value)
                                if (field.endswith("familyName")):
                                    fullField = "%s%s" % (value, fullField)
                                self.log.debug("fullname now is :%s" % fullField)
                                fullFieldMap.put(idx, fullField)
                            else:
                                fieldlist = self.arrayBucket.get(arrFldName)
                                if fieldlist is None:
                                    fieldlist = []
                                    self.arrayBucket.put(arrFldName, fieldlist)
                                fieldlist.append(value)

                    # Composite fields: join the "start" and "end" parts with
                    # the configured delimiter under the single key "1"
                    for compfield in self.compFields:
                        if field.startswith(compfield):
                            arrFldName = self.reportingFieldPrefix +compfield
                            fullFieldMap = self.arrayBucket.get(arrFldName)
                            if fullFieldMap is None:
                                fullFieldMap = HashMap()
                                self.arrayBucket.put(arrFldName, fullFieldMap)
                            fullField = fullFieldMap.get("1")
                            if fullField is None:
                                fullField = ""
                            if field.endswith(self.compFieldsConfig[compfield]["end"]):
                                fullField = "%s%s%s" % (fullField, self.compFieldsConfig[compfield]["delim"] ,value)
                            if field.endswith(self.compFieldsConfig[compfield]["start"]):
                                fullField = "%s%s" % (value, fullField)
                            self.log.debug("full field now is :%s" % fullField)
                            fullFieldMap.put("1", fullField)

        self.utils.add(self.index, "display_type", displayType)

        # Make sure we have a creation date
        if not createdDateFlag:
            self.utils.add(self.index, "date_created", self.last_modified)
            self.log.debug("Forced creation date to %s because it was not explicitly set." % self.last_modified)

        # Workflow processing
        wfStep = wfMeta.getString(None, ["step"])
        self.utils.add(self.index, "workflow_id", wfMeta.getString(None, ["id"]))
        self.utils.add(self.index, "workflow_step", wfStep)
        self.utils.add(self.index, "workflow_step_label", wfMeta.getString(None, ["label"]))
        # The owner is added alongside each security group, so it is only
        # indexed when the stage defines at least one group
        for group in workflow_security:
            self.utils.add(self.index, "workflow_security", group)
            if self.owner is not None:
                self.utils.add(self.index, "workflow_security", self.owner)
        # set OAI-PMH status to deleted
        if wfStep == "retired":
            self.utils.add(self.index, "oai_deleted", "true")

    def __messages(self):
        """Send a JSON message holding ``self.oid`` to every entry of ``self.message_list``."""
        targets = self.message_list
        # Nothing was requested by the workflow step
        if targets is None or len(targets) == 0:
            return

        # One message body is built and reused for all targets
        envelope = JsonSimple()
        envelope.getJsonObject().put("oid", self.oid)
        message = envelope.toString()
        for target in targets:
            self.utils.sendMessage(target, message)

    def __getJsonPayload(self, pid):
        """Return the payload ``pid`` of the current object parsed by ``utils.getJsonObject``.

        Errors raised by ``getPayload`` / ``open`` propagate to the caller
        (``__workflow`` relies on ``StorageException`` for a missing payload).
        The payload is closed even when parsing fails.
        """
        payload = self.object.getPayload(pid)
        stream = payload.open()
        try:
            return self.utils.getJsonObject(stream)
        finally:
            # Release the payload whether or not the parse succeeded
            payload.close()




# (c) 2015 - 2025 Weber Informatics LLC | Privacy Policy