uk.bl.wa.annotation.AnnotationsFromAct Maven / Gradle / Ivy
/**
*
*/
package uk.bl.wa.annotation;
/*
* #%L
* warc-indexer
* %%
* Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program. If not, see
* .
* #L%
*/
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.common.util.Base64;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.map.ObjectMapper;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import com.google.common.base.Joiner;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
/**
*
* This downloads the data from the ACT prototype (based on Drupal) and creates
* a set of @Annotations from the appropriate taxonomy.
*
* @author Roger Coram, Andrew Jackson
*
*/
public class AnnotationsFromAct {
private String[] crawlFreqs = new String[] { "nevercrawl", "domaincrawl",
"annual", "sixmonthly", "quarterly", "monthly", "weekly", "daily" };
private static String WARC_ACT_URL = "http://www.webarchive.org.uk/act/websites/export/daily";
private static String WARC_COLLECTIONS_URL = "http://www.webarchive.org.uk/act/taxonomy_term.xml?sort=name&direction=ASC&vocabulary=5&limit=500&page=0";
private static String WARC_COLLECTIONS_URL_JSON = "http://www.webarchive.org.uk/act/taxonomy_term.json?vocabulary=5&limit=500&page=0";
private static String WARC_SUBJECTS_URL_JSON = "http://www.webarchive.org.uk/act/taxonomy_term.json?vocabulary=2&limit=500&page=0";
private static Log LOG = LogFactory.getLog( AnnotationsFromAct.class );
private String cookie;
private String csrf;
private static final String COLLECTION_XML = "taxonomy_term";
private static final String OK_PUBLISH = "1";
private static final String FIELD_PUBLISH = "field_publish";
private static final String FIELD_DATES = "field_dates";
private static final String FIELD_NAME = "name";
private static final String FIELD_START_DATE = "value";
private static final String FIELD_END_DATE = "value2";
// Map of all categories and subjects:
private Map cm = new HashMap();
private Map sm = new HashMap();
// The annotations being built up from ACT:
private Annotations ann = new Annotations();
/**
*
* @throws IOException
* @throws JDOMException
*/
public AnnotationsFromAct() throws IOException, JDOMException {
// Populate
LOG.info("Logging into ACT...");
this.actLogin();
// Get the collections export:
LOG.info("Getting collections export from ACT...");
String collectionXml = readAct(AnnotationsFromAct.WARC_COLLECTIONS_URL);
LOG.info("Parsing collection XML...");
parseCollectionXml(collectionXml);
// Get all Targets:
LOG.info("Getting main export from ACT...");
String recordXml = readAct(AnnotationsFromAct.WARC_ACT_URL);
LOG.info("Parsing record XML...");
parseRecordXml(recordXml);
}
protected AnnotationsFromAct(String dummy) {
}
/**
* Performs login operation to ACT, setting Cookie and CSRF.
* @throws IOException
*/
private void actLogin() throws IOException {
Config loginConf = ConfigFactory
.parseFile(new File("credentials.conf"));
URL login = new URL( loginConf.getString( "act.login" ) );
LOG.info("Logging in at " + login);
HttpURLConnection connection = ( HttpURLConnection ) login.openConnection();
StringBuilder credentials = new StringBuilder();
credentials.append( loginConf.getString( "act.username" ) );
credentials.append( ":" );
credentials.append( loginConf.getString( "act.password" ) );
connection.setRequestProperty("Authorization", "Basic "
+ Base64.byteArrayToBase64(credentials.toString().getBytes()));
connection.setRequestProperty("Content-Type", "text/plain");
Scanner scanner;
if( connection.getResponseCode() != 200 ) {
scanner = new Scanner( connection.getErrorStream() );
scanner.useDelimiter( "\\Z" );
throw new IOException( scanner.next() );
} else {
scanner = new Scanner( connection.getInputStream() );
}
scanner.useDelimiter( "\\Z" );
this.csrf = scanner.next();
this.cookie = connection.getHeaderField( "set-cookie" );
}
/**
* Read data from ACT to include curator-specified metadata.
* @param conf
* @return
* @throws MalformedURLException
* @throws IOException
*/
private String readAct(String url) throws IOException {
URL act = new URL( url );
HttpURLConnection connection = ( HttpURLConnection ) act.openConnection();
if( this.cookie != null ) {
connection.setRequestProperty( "Cookie", this.cookie );
connection.setRequestProperty( "X-CSRF-TOKEN", this.csrf );
}
Scanner scanner;
if( connection.getResponseCode() != 200 ) {
scanner = new Scanner( connection.getErrorStream() );
scanner.useDelimiter( "\\Z" );
throw new IOException( scanner.next() );
} else {
scanner = new Scanner( connection.getInputStream() );
}
scanner.useDelimiter( "\\Z" );
return scanner.next();
}
/**
* Parses XML from ACT, mapping collection names to date ranges.
*
* @throws IOException
* @throws JDOMException
*
*/
@SuppressWarnings( "unchecked" )
private void parseCollectionXml( String xml ) throws JDOMException, IOException {
SAXBuilder builder = new SAXBuilder();
Document document = ( Document ) builder.build( new StringReader( xml ) );
Element rootNode = document.getRootElement();
List list = rootNode.getChildren( COLLECTION_XML );
Element node = null;
DateRange dateRange;
String name, start, end, publish;
for( int i = 0; i < list.size(); i++ ) {
node = ( Element ) list.get( i );
publish = node.getChildText( FIELD_PUBLISH );
name = node.getChildText(FIELD_NAME);
if( publish != null && publish.equals( OK_PUBLISH ) ) {
start = node.getChild( FIELD_DATES ).getChildText( FIELD_START_DATE );
end = node.getChild( FIELD_DATES ).getChildText( FIELD_END_DATE );
dateRange = new DateRange( start, end );
LOG.info("Adding collection " + name + " with dateRange "
+ dateRange);
ann.getCollectionDateRanges().put(name, dateRange);
} else {
LOG.info("Skipping collection \"" + name
+ "\" (not ok to publish)");
}
}
}
/**
* Removes inactive Collections before optionally creating a UriCollection.
*
* @param collectionCategories
* @param allCollections
* @param subject
* @return
*/
private UriCollection filterUriCollection( String collectionCategories, String allCollections, String subject ) {
UriCollection output = null;
Set validCollections = ann.getCollectionDateRanges().keySet();
if( collectionCategories != null && !validCollections.contains( collectionCategories ) )
collectionCategories = null;
ArrayList valid = new ArrayList();
if( allCollections != null ) {
for( String a : allCollections.split( "|" ) ) {
if( validCollections.contains( a ) )
valid.add( a );
}
if( valid.size() == 0 ) {
allCollections = null;
} else {
allCollections = Joiner.on( "|" ).join( valid );
}
}
valid.clear();
if( subject != null ) {
for( String s : subject.split( "|" ) ) {
if( validCollections.contains( s ) )
valid.add( s );
}
if( valid.size() == 0 ) {
subject = null;
} else {
subject = Joiner.on( "|" ).join( valid );
}
}
if( collectionCategories != null && allCollections != null && subject != null )
output = new UriCollection( collectionCategories, allCollections, subject );
return output;
}
/**
* Parses XML output from ACT into a lookup, mapping URLs to collections.
*
* @param xml
* @throws JDOMException
* @throws IOException
* @throws URISyntaxException
*/
private void parseRecordXml( String xml ) throws JDOMException, IOException {
SAXBuilder builder = new SAXBuilder();
Document document = ( Document ) builder.build( new StringReader( xml ) );
Element rootNode = document.getRootElement();
List list = rootNode.getChildren( "node" );
Element node = null;
String urls, collectionCategories, allCollections, subject, scope;
for( int i = 0; i < list.size(); i++ ) {
node = ( Element ) list.get( i );
urls = node.getChildText( "urls" );
collectionCategories = node.getChildText( "collectionCategories" );
// Trac #2271: Erroneous data in ACT might contain pipe-separated text.
if( collectionCategories != null && collectionCategories.indexOf( "|" ) != -1 ) {
collectionCategories = collectionCategories.split( "|" )[ 0 ];
}
allCollections = node.getChildText( "allCollections" );
subject = node.getChildText( "subject" );
scope = node.getChildText( "scope" );
LOG.info("Looking at scope [" + scope + "] subject [" + subject
+ "] collectionCategories [" + collectionCategories
+ "] w/ collections [" + allCollections + "]");
// As long as one of the fields is populated we have something to do...
if( collectionCategories != null || allCollections != null || subject != null ) {
UriCollection collection = filterUriCollection( collectionCategories, allCollections, subject );
LOG.info("Filtered to " + collection);
// There should be no scope beyond those created in the Constructor.
if( collection != null )
addCollection( scope, urls, collection );
}
}
for (String key : ann.getCollections().keySet()) {
LOG.info("Processed " + ann.getCollections().get(key).size()
+ " URIs for collection " + key);
}
}
/**
*
* @param scope
* @param urls
* @param collection
*/
private void addCollection( String scope, String urls, UriCollection collection ) {
LOG.debug("Adding " + urls + " to collection " + collection.toString());
HashMap relevantCollection = ann
.getCollections().get(scope);
for( String url : urls.split( "\\s+" ) ) {
if( scope.equals( "resource" ) ) {
/*
* FIXME try { // Trac #2271: try keying on canonicalized URL.
* url = canon.urlStringToKey(url); } catch( URIException u ) {
* LOG.warn("Problem parsing URL: " + u.getMessage() + ": " +
* url); }
*/
relevantCollection.put( url, collection );
} else {
URI uri;
try {
uri = new URI( url );
} catch( URISyntaxException e ) {
LOG.warn( e.getMessage() );
continue;
}
if( scope.equals( "root" ) ) {
String prefix = uri.getScheme() + "://" + uri.getHost();
relevantCollection.put( prefix, collection );
}
if( scope.equals( "subdomains" ) ) {
String host = uri.getHost();
relevantCollection.put( host, collection );
}
}
}
}
/**
*
* @return
*/
public Annotations getAnnotations() {
return ann;
}
/**
*
* @param map
* @param startUrl
* @throws IOException
*/
private void getTaxonomyViaJson(Map map, String startUrl)
throws IOException {
// Get the collections export:
String nextUrl = startUrl;
String thisUrl = null;
// Grab all the pages of collections:
do {
// Load the content:
thisUrl = nextUrl;
LOG.info("Getting taxnomy export from ACT... " + thisUrl);
String collectionXml = readAct(thisUrl);
// Map it to JsonNode tree:
ObjectMapper mapper = new ObjectMapper();
JsonParser jp = mapper.getJsonFactory().createJsonParser(
collectionXml);
JsonNode root = jp.readValueAsTree();
// Add to the map of the categories:
for (JsonNode node : root.get("list")) {
Integer ci = Integer.parseInt(node.get("tid").getTextValue());
map.put(ci, node);
}
// Look up the next URL:
nextUrl = root.path("next").getTextValue();
if( nextUrl != null)
nextUrl = nextUrl.replaceFirst("\\?", "\\.json\\?");
} while (nextUrl != null);
}
/**
*
* @throws JsonParseException
* @throws IOException
*/
private void getCollectionsViaJson() throws IOException {
// Get the subjects taxonomy:
this.getTaxonomyViaJson(sm, AnnotationsFromAct.WARC_SUBJECTS_URL_JSON);
// Get the collections taxonomy:
this.getTaxonomyViaJson(cm,
AnnotationsFromAct.WARC_COLLECTIONS_URL_JSON);
// Now patch up the parent-child relationships etc.
for (JsonNode node : cm.values()) {
// Get the parent categories:
List cats = this.resolveParents(node);
// Turn that into a string representation:
String catPath = this.getCatPath(cats);
// Look to see if the root collection is marked as published:
Boolean publish = cats.get(0).get("field_publish")
.getBooleanValue();
if (publish) {
// LOG.info("Collection Path: " + catPath + " PUBLISHED");
// Add to list of collections, w/ date ranges:
String name = catPath;
String start = null;
if (cats.get(0).get("field_dates").get("value") != null) {
start = cats.get(0).get("field_dates").get("value")
.getTextValue();
}
String end = null;
if (cats.get(0).get("field_dates").get("value2") != null) {
end = cats.get(0).get("field_dates").get("value2")
.getTextValue();
}
DateRange dateRange = new DateRange(start, end);
// LOG.info("Adding collection " + name + " with dateRange "
// + dateRange);
ann.getCollectionDateRanges().put(name, dateRange);
} else {
LOG.debug("Skipping unpublished collection with path: "
+ catPath);
}
}
}
/**
*
* @param cats
* @return
*/
private String getCatPath(List cats) {
// Build up the full path string:
StringBuilder catPath = new StringBuilder();
for (int i = 0; i < cats.size(); i++) {
JsonNode cat = cats.get(i);
catPath.append(cat.get("name").getTextValue());
// Append a separator if this is not the last entry:
if (i < cats.size() - 1)
catPath.append("|");
}
return catPath.toString();
}
/**
*
* @param c
* @param cats
*/
private void resolveParents(JsonNode c, List cats) {
// Store this item:
cats.add(0, c);
// Loop through the parents (although there is only ever one in this
// dataset):
for (JsonNode parentRef : c.get("parent")) {
Integer ci = parentRef.get("id").getIntValue();
JsonNode parent = cm.get(ci);
resolveParents(parent, cats);
}
}
private List resolveParents(JsonNode c) {
// Get the parent categories:
List cats = new ArrayList();
// Find all the parents:
this.resolveParents(c, cats);
// And return:
return cats;
}
/**
*
* @throws IOException
*/
private void getTargetsViaJson() throws IOException {
String actUrl = "http://www.webarchive.org.uk/act/node.json?type=url";
int page = 0;
int max_page = -1;
do {
page++;
LOG.info("Getting page " + page + " of targets export from ACT... "
+ actUrl);
String targets = readAct(actUrl);
ObjectMapper mapper = new ObjectMapper();
JsonParser jp = mapper.getJsonFactory().createJsonParser(targets);
JsonNode root = jp.readValueAsTree();
for (JsonNode node : root.get("list")) {
String scope = node.get("field_scope").getTextValue();
LOG.debug("Got \"" + node.get("title").getTextValue()
+ "\" with scope: " + scope);
String collectionCategories = null;
List allCollections = new ArrayList();
String[] subjects = null;
// Add on the categories:
for (JsonNode cat : node.get("field_collection_categories")) {
Integer cid = Integer
.parseInt(cat.get("id").getTextValue());
JsonNode catd = cm.get(cid);
if (catd == null) {
LOG.warn("NULL catd for id=" + cid + " from: "
+ node.asText());
continue;
}
LOG.debug("collectionCategories: "
+ catd.get("name").getTextValue());
// Get the parent categories:
List catds = this.resolveParents(catd);
// Turn that into a string representation:
String catPath = this.getCatPath(catds);
allCollections.add(catPath);
if (collectionCategories == null) {
collectionCategories = catds.get(0).get("name")
.getTextValue();
}
}
// Get the Subject:
if( node.get("field_subject") != null ) {
Integer sid = Integer.parseInt(node.get("field_subject")
.get("id").getTextValue());
String subject = sm.get(sid).get("name").getTextValue();
LOG.debug("Found a SUBJECT: "
+ node.get("field_subject").get("id") + " > "
+ subject);
subjects = new String[] { subject };
}
UriCollection uc = new UriCollection(collectionCategories,
allCollections.toArray(new String[1]), subjects);
for (JsonNode url : node.get("field_url")) {
LOG.debug("Got " + url.get("url").getTextValue());
// Add to the collection:
addCollection(scope, url.get("url").getTextValue(), uc);
}
}
// Look up the next page URL:
actUrl = root.path("next").getTextValue();
if (actUrl != null)
actUrl = actUrl.replaceFirst("\\?", "\\.json\\?");
} while (actUrl != null && (page < max_page || max_page < 0));
// Summarise the result:
for (String key : ann.getCollections().keySet()) {
LOG.info("Processed " + ann.getCollections().get(key).size()
+ " URIs for collection " + key);
}
}
/**
*
* @param args
* @throws IOException
* @throws MalformedURLException
* @throws JsonParseException
* @throws JDOMException
*/
public static void main(String[] args) throws JsonParseException,
MalformedURLException, IOException, JDOMException {
// Populate
LOG.info("Logging into ACT...");
AnnotationsFromAct act = new AnnotationsFromAct("dummy");
act.actLogin();
act.getCollectionsViaJson();
act.getTargetsViaJson();
String filename = "annotations.json";
LOG.info("Writing annotations to: " + filename);
act.getAnnotations().toJsonFile(filename);
LOG.info("...done.");
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy