# harvest.zotero.py
#
# Distributed in the plugin-harvester-filesystem artifact, described as:
# "Performs a batch harvest of any filesystem that can support Java".
import md5, os, time
from com.googlecode.fascinator.indexer.rules import AddField, New
from sets import Set
import org.ontoware.rdf2go as rdf2go
import java.io as io


#
# Available objects:
#    indexer    : Indexer instance
#    jsonConfig : JsonConfigHelper of our harvest config file
#    rules      : RuleManager instance
#    object     : DigitalObject to index
#    payload    : Payload to index
#    params     : Metadata Properties object
#    pyUtils    : Utility object for accessing app logic
#

def indexList(name, values):
    """Queue one ``AddField(name, value)`` rule for each item of *values*.

    name   -- index field name passed to every AddField
    values -- iterable of field values; an empty iterable adds no rules
    """
    for entry in values:
        field = AddField(name, entry)
        rules.add(field)
        
def runQuery(query):
    """Run a SPARQL SELECT query against the module-level ``model``.

    A PREFIX header is prepended to *query* so callers can use the short
    forms bib:, dc:, dcterms:, prism:, foaf:, vcard:, link: and z:.
    The full query text is printed before it is executed.

    query -- SPARQL text without PREFIX declarations
    Returns whatever ``model.sparqlSelect`` returns for the full query.
    """
    # Each PREFIX declaration needs an <IRI>; without it the query is not
    # valid SPARQL. These are the namespaces declared by Zotero RDF exports.
    prefixes = (
        ("bib", "http://purl.org/net/biblio#"),
        ("dc", "http://purl.org/dc/elements/1.1/"),
        ("dcterms", "http://purl.org/dc/terms/"),
        ("prism", "http://prismstandard.org/namespaces/1.2/basic/"),
        ("foaf", "http://xmlns.com/foaf/0.1/"),
        ("vcard", "http://nwalsh.com/rdf/vCard#"),
        ("link", "http://purl.org/rss/1.0/modules/link/"),
        ("z", "http://www.zotero.org/namespaces/export#"),
    )
    header = "".join(["PREFIX %s: <%s>\n" % pair for pair in prefixes])
    q = header + query
    print(q)
    return model.sparqlSelect(q)

def addValue(list, node):
    """Add ``node.toString()`` to *list* unless *node* is falsy.

    list -- collection exposing ``add`` (the module uses Set instances)
    node -- object exposing ``toString``; falsy values (e.g. None) are skipped
    """
    if not node:
        return
    text = node.toString()
    list.add(text)

def getDocumentDetails(id):
    """Query the details of one document and collect them for indexing.

    id -- the document node in SPARQL syntax, spliced into the query as the
          subject of every OPTIONAL pattern

    Each result row is printed, then its title, id, subject, date and type
    bindings are handed to addValue() for the matching module-level set.
    """
    # Continuation padding kept so the query text matches the original layout.
    pad = " " * 12
    tails = (
        " dc:title ?title } ",
        " dc:identifier ?id} ",
        " dc:subject ?subject} ",
        " dc:date ?date} ",
        " z:itemType ?type }",
    )
    optionals = ["OPTIONAL {" + id + tail for tail in tails]
    sparql = ("SELECT ?id ?title ?type ?subject ?date " + pad + "WHERE { "
              + (pad + ". ").join(optionals) + "}")

    for solution in runQuery(sparql):
        print("Record: " + solution.toString())
        targets = (
            (titleList, "title"),
            (idList, "id"),
            (subjectList, "subject"),
            (dateList, "date"),
            (typeList, "type"),
        )
        for bucket, variable in targets:
            addValue(bucket, solution.getValue(variable))
        
        

def getDocuments():
    """Select every resource typed as a bib: document class and collect it.

    Builds one SELECT whose WHERE clause is a UNION over bib:Document and
    the classes listed below, prints each result row, and passes the
    ?document binding (in SPARQL syntax) to getDocumentDetails().
    """
    # bib:Document is written out in the query head; the rest become UNIONs.
    other_classes = (
        "Article", "AcademicArticle", "AudioDocument", "AudioVisualDocument",
        "Film", "Book", "Proceedings", "CollectedDocument", "EditedBook",
        "Issue", "DocumentPart", "BookSection", "Excerpt", "Slide", "Image",
        "Map", "LegalDocument", "LegalCaseDocument", "Legislation", "Manual",
        "Manuscript", "Note", "Patent", "PersonalCommunicationDocument",
        "Email", "Letter", "ReferenceSource", "Report", "Slideshow",
        "Standard", "Thesis", "Webpage",
    )
    head = "SELECT ?document WHERE { { ?document a bib:Document } "
    unions = "".join(["    UNION { ?document a bib:" + cls + "} "
                      for cls in other_classes])
    sparql = head + unions + "    }"

    for solution in runQuery(sparql):
        print("Record: " + solution.toString())
        document = solution.getValue("document")
        getDocumentDetails(document.toSPARQL())


# Start with a blank Solr document.
rules.add(New())

# Accumulators filled by getDocumentDetails(); being sets, a value that
# appears in several result rows is kept only once.
titleList = Set()
idList = Set()
subjectList = Set()
dateList = Set()
typeList = Set()

# Common fields
oid = object.getId()
rules.add(AddField("id", oid))
rules.add(AddField("storage_id", oid))
rules.add(AddField("item_type", "object"))
# The trailing "Z" declares UTC, so format the UTC clock explicitly:
# time.strftime() uses local time when no time tuple is supplied.
rules.add(AddField("last_modified",
                   time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())))
rules.add(AddField("harvest_config", params.getProperty("jsonConfigOid")))
rules.add(AddField("harvest_rules",  params.getProperty("rulesOid")))

rules.add(AddField("repository_name", params["repository.name"]))
rules.add(AddField("repository_type", params["repository.type"]))

# Security: one security_filter field per role that may access the object.
roles = pyUtils.getRolesWithAccess(oid)
if roles is None:
    # No role list was returned: record guest access in the "derby"
    # access schema and index the object as guest-visible.
    schema = pyUtils.getAccessSchema("derby")
    schema.setRecordId(oid)
    schema.set("role", "guest")
    pyUtils.setAccessSchema(schema, "derby")
    rules.add(AddField("security_filter", "guest"))
else:
    for role in roles:
        rules.add(AddField("security_filter", role))

# Load the RDF file named by the object id into a fresh RDF2Go model and
# collect the document metadata from it. `model` stays a module-level name
# because runQuery() reads it.
model = rdf2go.RDF2Go.getModelFactory().createModel()
model.open()
try:
    reader = io.FileReader(io.File(oid))
    try:
        model.readFrom(reader)
    finally:
        # Release the file handle whether or not parsing succeeded.
        reader.close()
    getDocuments()
finally:
    # Always close the model, even if reading or querying raised.
    model.close()

# Emit the collected values as index fields.
indexList("dc_title", titleList)
indexList("dc_identifier", idList)
indexList("dc_subject", subjectList)
indexList("dc_type", typeList)
# Not collected yet, so left disabled:
#indexList("dc_contributor", contributorList)
#indexList("dc_description", descriptionList)
#indexList("dc_format", formatList)
indexList("dc_date", dateList)