// uk.bl.wa.annotation.Annotator (Maven / Gradle / Ivy artifact source listing)
/**
*
*/
package uk.bl.wa.annotation;
import java.io.FileNotFoundException;
import java.io.FileReader;
/*
* #%L
* warc-indexer
* %%
* Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/gpl-2.0.html>.
* #L%
*/
import java.io.IOException;
import java.io.PrintStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.archive.url.UsableURIFactory;
import org.archive.util.SurtPrefixSet;
import org.jdom.JDOMException;
import uk.bl.wa.indexer.WARCIndexer;
import uk.bl.wa.solr.SolrFields;
/**
*
* This is the core annotation class that applies the annotations to a
* SolrInputDocument.
*
* @author Roger Coram, Andrew Jackson
*
*/
public class Annotator {

    private static final Log LOG = LogFactory.getLog(Annotator.class);

    /** The annotations (collections, subjects and date ranges) to apply. */
    private final Annotations annotations;

    /** Optional SURT prefixes marking Open Access content; may be null. */
    private final SurtPrefixSet openAccessSurts;

    /**
     * Factory method to pull annotations from ACT. The returned Annotator is
     * created without an Open Access SURT prefix set.
     *
     * @return an Annotator backed by the annotations obtained from ACT
     * @throws IOException
     * @throws JDOMException
     */
    public static Annotator annotationsFromAct() throws IOException,
            JDOMException {
        AnnotationsFromAct act = new AnnotationsFromAct();
        return new Annotator(act.getAnnotations(), null);
    }

    /**
     * Loads a SURT prefix set from the given file. The file is closed once
     * the prefixes have been imported.
     *
     * @param surtPrefixFile
     *            path of the file holding the SURT prefixes
     * @return the populated prefix set
     * @throws FileNotFoundException
     *             if the file cannot be opened for reading
     */
    public static SurtPrefixSet loadSurtPrefix(String surtPrefixFile)
            throws FileNotFoundException {
        SurtPrefixSet surtPrefix = new SurtPrefixSet();
        FileReader fileReader = new FileReader(surtPrefixFile);
        try {
            surtPrefix.importFrom(fileReader);
        } finally {
            // Close explicitly so the file handle is not leaked. A failure to
            // close is logged rather than thrown to keep the declared
            // exception contract unchanged.
            try {
                fileReader.close();
            } catch (IOException e) {
                LOG.warn("Failed to close SURT prefix file "
                        + surtPrefixFile, e);
            }
        }
        return surtPrefix;
    }

    /**
     * Creates an Annotator.
     *
     * @param annotations
     *            the annotations to apply
     * @param oaSurts
     *            SURT prefixes used to flag Open Access records, or null to
     *            skip the Open Access check
     */
    public Annotator(Annotations annotations, SurtPrefixSet oaSurts) {
        this.annotations = annotations;
        this.openAccessSurts = oaSurts;
    }

    /**
     * Runs through the annotation scopes ("resource", "root", "subdomains"
     * and "source_file_matches"), applies every matching collection to the
     * document and, if an Open Access SURT set was supplied, sets the access
     * terms field to "OA" or "RRO".
     *
     * Collections are only applied for crawl dates that fall within the date
     * range registered for the collection (Trac #2243).
     *
     * @param uri
     *            the URI of the record being annotated
     * @param solr
     *            the document to update in place
     * @throws URISyntaxException
     * @throws URIException
     */
    public void applyAnnotations(URI uri, SolrInputDocument solr)
            throws URISyntaxException, URIException {
        LOG.debug("Updating collections for "
                + solr.getField(SolrFields.SOLR_URL));

        // Trac #2243; annotations should only be applied if the record's
        // timestamp is within the range set by the Collection, so gather all
        // the crawl dates first:
        Set<String> crawlDates = collectCrawlDates(solr);

        // "Just this URL"
        // String normd = canon.urlStringToKey(uri.toString());
        String normd = uri.toString();
        LOG.debug("Comparing with " + normd);
        Map<String, UriCollection> resources = collectionsForScope("resource");
        if (resources != null && resources.containsKey(normd)) {
            LOG.debug("Applying resource-level annotations...");
            updateCollections(resources.get(normd), solr, crawlDates);
        }

        // "All URLs that start like this".
        // NOTE(review): this is an exact match on the full URL, identical to
        // the "resource" scope, although the comment above and the Solr
        // prefix query in searchAndApplyAnnotations suggest a prefix match
        // was intended - confirm before changing.
        Map<String, UriCollection> roots = collectionsForScope("root");
        if (roots != null && roots.containsKey(normd)) {
            LOG.debug("Applying root-level annotations...");
            updateCollections(roots.get(normd), solr, crawlDates);
        }

        // "All URLs that match match this host or any subdomains".
        Map<String, UriCollection> subdomains = collectionsForScope(
                "subdomains");
        if (subdomains != null) {
            String documentHost = uri.getHost();
            if (documentHost == null) {
                // java.net.URI yields no host for opaque or registry-based
                // URIs; there is nothing to compare against.
                LOG.debug("No host available for " + uri
                        + "; skipping subdomain annotations.");
            } else {
                String domain = documentHost.replaceAll("^www\\.", "");
                for (Map.Entry<String, UriCollection> entry : subdomains
                        .entrySet()) {
                    String key = entry.getKey();
                    LOG.debug("Applying subdomain annotations for: " + key);
                    String host = hostOf(key);
                    // NOTE(review): this tests whether the annotation host
                    // equals or ends with the document's domain; for "this
                    // host or any subdomains" the reverse test might have
                    // been intended - confirm before changing.
                    if (host.equals(domain) || host.endsWith("." + domain)) {
                        updateCollections(entry.getValue(), solr, crawlDates);
                    }
                }
            }
        }

        // "All source_file that match this source_file_matches"
        Map<String, UriCollection> sourceFileMatches = collectionsForScope(
                "source_file_matches");
        if (sourceFileMatches != null) {
            Object sourceFileValue = solr.getFieldValue(SolrFields.SOURCE_FILE);
            if (sourceFileValue == null) {
                // Documents built by searchAndApplyAnnotations carry no
                // source file, so this must not be treated as an error.
                LOG.debug("No source file set for " + uri
                        + "; skipping source_file_matches annotations.");
            } else {
                String sourceFile = sourceFileValue.toString();
                for (Map.Entry<String, UriCollection> entry : sourceFileMatches
                        .entrySet()) {
                    String key = entry.getKey();
                    LOG.debug("Applying source_file_matches annotations for: "
                            + key);
                    Matcher matcher;
                    try {
                        matcher = Pattern.compile(key).matcher(sourceFile);
                    } catch (IllegalArgumentException e) {
                        // PatternSyntaxException: one bad pattern should not
                        // prevent the remaining annotations being applied.
                        LOG.error("Invalid source_file_matches pattern: "
                                + key, e);
                        continue;
                    }
                    // A single match suffices: updateCollections never adds
                    // a value twice, so repeating it per match is redundant.
                    if (matcher.find()) {
                        updateCollections(entry.getValue(), solr, crawlDates);
                    }
                }
            }
        }

        // Also use the prefix-based whitelist to note Open Access records:
        if (this.openAccessSurts != null) {
            LOG.debug("Attempting to apply " + this.openAccessSurts.size()
                    + " OA Surts to " + uri);
            String surt = SurtPrefixSet.getCandidateSurt(
                    UsableURIFactory.getInstance(uri.toString()));
            if (this.openAccessSurts.containsPrefixOf(surt)) {
                setUpdateField(solr, SolrFields.ACCESS_TERMS, "OA");
            } else {
                setUpdateField(solr, SolrFields.ACCESS_TERMS, "RRO");
            }
        }
    }

    /**
     * Looks up the URL-to-collection mapping registered for a scope.
     *
     * @param scope
     *            the scope name, e.g. "resource"
     * @return the mapping for the scope, or null if the scope is absent
     */
    private Map<String, UriCollection> collectionsForScope(String scope) {
        return this.annotations.getCollections().get(scope);
    }

    /**
     * Gathers the crawl date strings from the single crawl date field and
     * the multi-valued crawl dates field of the document.
     *
     * @param solr
     *            the document to read from
     * @return the distinct date strings found; possibly empty, never null
     */
    private static Set<String> collectCrawlDates(SolrInputDocument solr) {
        Set<String> crawlDates = new HashSet<String>();
        addCrawlDate(crawlDates, solr.getFieldValue(SolrFields.CRAWL_DATE));
        if (solr.getFieldValues(SolrFields.CRAWL_DATES) != null) {
            for (Object d : solr.getFieldValues(SolrFields.CRAWL_DATES)) {
                addCrawlDate(crawlDates, d);
            }
        }
        return crawlDates;
    }

    /**
     * Adds a crawl date value to the set. Plain strings are added directly;
     * for a Map, its values are added. Null is ignored and any other type is
     * logged and skipped instead of failing with a ClassCastException.
     */
    private static void addCrawlDate(Set<String> crawlDates, Object value) {
        if (value instanceof String) {
            crawlDates.add((String) value);
        } else if (value instanceof Map) {
            for (Object inner : ((Map<?, ?>) value).values()) {
                addCrawlDate(crawlDates, inner);
            }
        } else if (value != null) {
            LOG.warn("Ignoring crawl date of unexpected type "
                    + value.getClass().getName() + ": " + value);
        }
    }

    /**
     * Extracts the host from an annotation key, falling back to the key
     * itself when it is not a parseable URI or has no host component.
     */
    private static String hostOf(String key) {
        String host = null;
        try {
            host = URI.create(key).getHost();
        } catch (IllegalArgumentException e) {
            LOG.debug("Annotation key is not a valid URI, using it as a host: "
                    + key);
        }
        return host != null ? host : key;
    }

    /**
     * Tells whether a date range is registered for the named collection and
     * the given date lies within it.
     */
    private boolean isInCollectionDateRange(String collectionName, Date date) {
        return this.annotations.getCollectionDateRanges().containsKey(
                collectionName)
                && this.annotations.getCollectionDateRanges()
                        .get(collectionName).isInDateRange(date);
    }

    /**
     * Updates a given SolrRecord with collections details from a
     * UriCollection. For every crawl date that can be parsed, the main
     * collection and the hierarchical collections are added when the date is
     * within the collection's date range; the subjects are added without a
     * date range check.
     *
     * @param collection
     *            the collection details to apply; null is ignored
     * @param solr
     *            the document to update in place
     * @param crawl_dates
     *            the crawl date strings of the record
     */
    private void updateCollections(UriCollection collection,
            SolrInputDocument solr, Set<String> crawl_dates) {
        if (collection == null) {
            return;
        }
        // Loop over all the dates:
        for (String dateString : crawl_dates) {
            Date date;
            try {
                // NOTE(review): if WARCIndexer.formatter is a shared
                // SimpleDateFormat it is not safe for concurrent use -
                // confirm how this class is used.
                date = WARCIndexer.formatter.parse(dateString);
            } catch (ParseException e) {
                LOG.error("Could not parse " + dateString, e);
                continue;
            }
            LOG.debug("Using collection: " + collection);

            // Update the single, main collection
            if (collection.collection != null
                    && collection.collection.length() > 0
                    && isInCollectionDateRange(collection.collection, date)) {
                setUpdateField(solr, SolrFields.SOLR_COLLECTION,
                        collection.collection);
                LOG.debug("Added collection " + collection.collection
                        + " to " + solr.getField(SolrFields.SOLR_URL));
            }

            // Iterate over the hierarchical collections
            if (collection.collections != null) {
                for (String col : collection.collections) {
                    LOG.debug("Considering adding collection '" + col
                            + "' to " + solr.getField(SolrFields.SOLR_URL));
                    if (isInCollectionDateRange(col, date)) {
                        setUpdateField(solr, SolrFields.SOLR_COLLECTIONS, col);
                        LOG.debug("Added collection '" + col + "' to "
                                + solr.getField(SolrFields.SOLR_URL));
                    }
                }
            }

            // Iterate over the subjects
            if (collection.subject != null) {
                for (String subject : collection.subject) {
                    setUpdateField(solr, SolrFields.SOLR_SUBJECT, subject);
                    LOG.debug("Added subject '" + subject + "' to "
                            + solr.getField(SolrFields.SOLR_URL));
                }
            }
        }
    }

    /**
     * Adds a value to a field of the document unless the field already
     * contains that value.
     */
    private static void setUpdateField(SolrInputDocument doc, String field,
            String value) {
        if (doc.getFieldValues(field) == null
                || !doc.getFieldValues(field).contains(value)) {
            doc.addField(field, value);
        }
    }

    /**
     * Adds a {"set": value} operation map to a field of the document unless
     * one of the maps already held by the field contains the value.
     *
     * Currently not called from within this class.
     */
    private static void setSolrUpdateField(SolrInputDocument doc, String field,
            String value) {
        Map<String, String> operation = new HashMap<String, String>();
        operation.put("set", value);
        // Check to see if this value is already in:
        boolean newValue = true;
        if (doc.getFieldValues(field) != null) {
            for (Object val : doc.getFieldValues(field)) {
                // Only operation maps can be inspected; other value types
                // are skipped rather than cast blindly.
                if (val instanceof Map
                        && ((Map<?, ?>) val).values().contains(value)) {
                    newValue = false;
                    break;
                }
            }
        }
        // Add it if it is a new value:
        if (newValue) {
            LOG.info("Adding value: " + value + " to field: " + field
                    + " for URI " + doc.getFieldValue(SolrFields.SOLR_URL));
            doc.addField(field, operation);
        } else {
            LOG.debug("Skipping addition of existing field value: " + value
                    + " to field: " + field);
        }
    }

    /**
     * Pretty-prints the fields of a document, sorted by field name, to the
     * given stream.
     *
     * @param out
     *            the stream to print to
     * @param doc
     *            the document to print
     */
    protected static void prettyPrint(PrintStream out, SolrInputDocument doc) {
        List<String> sortedFieldNames = new ArrayList<String>(
                doc.getFieldNames());
        Collections.sort(sortedFieldNames);
        out.println();
        for (String field : sortedFieldNames) {
            out.println(String.format("\t%s: %s", field,
                    doc.getFieldValues(field)));
        }
        out.println();
    }

    /**
     * Runs a query, applies the annotations to each returned document and
     * sends the result back to Solr. The caller is responsible for the
     * commit.
     *
     * NOTE(review): only the results of a single query call are processed
     * (no paging), and the document sent back holds just the id, crawl date
     * and URL plus the annotation fields - confirm that this does not
     * discard other stored fields (setSolrUpdateField suggests atomic
     * updates were considered).
     */
    private static void searchAndApplyAnnotations(Annotator anr,
            SolrClient solr, SolrQuery parameters)
            throws SolrServerException, URISyntaxException, IOException {
        QueryResponse response = solr.query(parameters);
        SolrDocumentList list = response.getResults();
        for (SolrDocument doc : list) {
            Object urlValue = doc.getFieldValue(SolrFields.SOLR_URL);
            if (urlValue == null) {
                // Without a URL there is nothing to match annotations on.
                LOG.warn("Skipping document without a URL: "
                        + doc.getFieldValue(SolrFields.ID));
                continue;
            }
            SolrInputDocument solrInDoc = new SolrInputDocument();
            solrInDoc.setField(SolrFields.ID, doc.getFieldValue(SolrFields.ID));
            solrInDoc.setField(SolrFields.CRAWL_DATE,
                    doc.getFieldValue(SolrFields.CRAWL_DATE));
            solrInDoc.setField(SolrFields.SOLR_URL, urlValue);
            URI uri = new URI(urlValue.toString());
            // Update all of those records with the applicable
            // categories etc.
            anr.applyAnnotations(uri, solrInDoc);
            solr.add(solrInDoc);
        }
    }

    /**
     * Connects to the Solr server, looks up the documents matching every
     * "resource" (exact URL) and "root" (URL prefix) annotation key, applies
     * the annotations to them and commits. Other scopes are ignored with a
     * warning. The Solr client is closed before returning.
     */
    private static void searchAndApplyAnnotations(Annotations ann,
            SurtPrefixSet oaSurts, String solrServer)
            throws SolrServerException, URISyntaxException, IOException {
        // Connect to solr:
        SolrClient solr = new HttpSolrClient(solrServer);
        try {
            // Set up annotator:
            Annotator anr = new Annotator(ann, oaSurts);
            // Loop over URL known to ACT:
            for (String scope : ann.getCollections().keySet()) {
                if ("resource".equals(scope)) {
                    // Search for all matching URLs in SOLR:
                    for (String uriKey : ann.getCollections().get(scope)
                            .keySet()) {
                        LOG.info("Looking for URL: " + uriKey);
                        SolrQuery parameters = new SolrQuery();
                        parameters.set("q",
                                "url:" + ClientUtils.escapeQueryChars(uriKey));
                        searchAndApplyAnnotations(anr, solr, parameters);
                    }
                } else if ("root".equals(scope)) {
                    // Search for all matching URLs in SOLR:
                    for (String uriKey : ann.getCollections().get(scope)
                            .keySet()) {
                        LOG.info("Looking for URLs starting with: " + uriKey);
                        SolrQuery parameters = new SolrQuery();
                        parameters.set("q", "url:"
                                + ClientUtils.escapeQueryChars(uriKey) + "*");
                        searchAndApplyAnnotations(anr, solr, parameters);
                    }
                } else {
                    LOG.warn("Ignoring annotations scoped as: " + scope);
                }
            }
            // And commit:
            solr.commit();
        } finally {
            // Release the HTTP resources held by the client.
            solr.close();
        }
    }

    /**
     * Command-line entry point: applies annotations to documents already in
     * a Solr index.
     *
     * @param args
     *            the annotations JSON file, the Open Access SURT prefix file
     *            and the Solr server URL, in that order
     * @throws IOException
     * @throws JDOMException
     * @throws URISyntaxException
     * @throws SolrServerException
     * @throws IllegalArgumentException
     *             if fewer than three arguments are given
     */
    public static void main(String[] args) throws IOException, JDOMException,
            URISyntaxException, SolrServerException {
        if (args == null || args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: Annotator <annotations.json> <oa-surts-file> <solr-server-url>");
        }
        Annotations ann = Annotations.fromJsonFile(args[0]);
        SurtPrefixSet oaSurts = Annotator.loadSurtPrefix(args[1]);
        searchAndApplyAnnotations(ann, oaSurts, args[2]);
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy