All Downloads are FREE. Search and download functionalities are using the official Maven repository.

uk.bl.wa.memento.client.MementosAggregator Maven / Gradle / Ivy

The newest version!
/**
 * Client-side aggregator that queries several Memento TimeGates in parallel.
 */
package uk.bl.wa.memento.client;

/*
 * #%L
 * MementoWeb Java Client
 * %%
 * Copyright (C) 2012 - 2015 The British Library
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.log4j.Logger;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import dev.memento.Memento;
import dev.memento.MementoClient;
import dev.memento.MementoList;

/**
 * Looks up the Mementos of a URL by querying a set of Memento TimeGates in
 * parallel and merging the answers into a single {@link MementoList}.
 * <p>
 * The set of archives (TimeMap prefixes, TimeGates and icons) is read from the
 * remote archive list when the aggregator is constructed. If that fails, the
 * built-in TimeMap prefixes are installed instead; no TimeGates are known in
 * that case, so {@link #lookup(String)} returns an empty list.
 *
 * @author Andrew Jackson 
 *
 */
public class MementosAggregator {
	static Logger log = Logger.getLogger(MementosAggregator.class.getCanonicalName());

	/** Location of the XML document listing the known archives. */
	private static final String ARCHIVE_LIST_URL = "http://labs.mementoweb.org/aggregator_config/archivelist.xml";

	/** Suffix removed from a TimeGate URI to obtain the prefix used for icon matching. */
	private static final String TIMEGATE_SUFFIX = "/timegate/";

	/** Connect timeout, in seconds (converted to milliseconds for the HTTP client). */
	private int connectionTimeoutSeconds = 5;
	
	/** Socket (response) timeout, in seconds (converted to milliseconds for the HTTP client). */
	private int responseTimeoutSeconds   = 20;
	
	/** Connection pool shared by every TimeGate request made through {@link #httpClient}. */
	private final PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();

	private CloseableHttpClient httpClient;

	/** TimeMap prefixes installed when the remote archive list cannot be loaded. */
	private String[] defaultTimeMapPrefixes = {
			"http://www.webarchive.org.uk/wayback/archive/timemap/link/",
			"http://archive.today/timemap/",
			"http://wayback.archive-it.org/all/timemap/link/",
			"http://wayback.vefsafn.is/wayback/timemap/link/",
			"http://web.archive.org/web/timemap/link/",
			"http://webarchive.proni.gov.uk/timemap/",
			"https://swap.stanford.edu/timemap/link/",
			"http://webarchive.nationalarchives.gov.uk/timemap/",
			"http://webarchive.parliament.uk/timemap/"
	};
	
	/** TimeMap URI prefixes, one per archive that declares a timemap. */
	private List<String> timeMapPrefixes = new ArrayList<String>();

	/** TimeGate URIs, one per archive that declares a timegate; these drive {@link #lookup(String)}. */
	private List<String> timeGates = new ArrayList<String>();
	
	/** Maps an archive URL prefix to the URI of that archive's icon. */
	private Map<String, String> icons = new HashMap<String, String>();

	/**
	 * Loads the archive list and builds the shared, pooled HTTP client.
	 * <p>
	 * A proxy is used when the {@code http.proxyHost} system property is set;
	 * {@code http.proxyPort} defaults to 80 when it is not set.
	 */
	public MementosAggregator() {
		// Try to get latest list:
		try {
			getTimeMapPrefixes();
		} catch (Exception e) {
			log.warn("Could not load the archive list from " + ARCHIVE_LIST_URL
					+ "; falling back to the default TimeMap prefixes.", e);
			this.timeMapPrefixes = new ArrayList<String>();
			for( String item : defaultTimeMapPrefixes) {
				this.timeMapPrefixes.add(item);
			}
		}
		// Output
		for( String tg : this.timeGates ) {
			log.info("TG: "+tg);
		}
		if( this.timeGates.isEmpty() ) {
			// Lookups iterate over the TimeGates, so make the consequence visible.
			log.warn("No TimeGates are configured; lookups will return no mementos.");
		}
		
		// Increase max total connection to 200
		cm.setMaxTotal(200);
		// Increase default max connection per route to 20
		cm.setDefaultMaxPerRoute(20);

		// Proxy?
		HttpHost proxy = null;
		String proxyHost = System.getProperty("http.proxyHost");
		if( proxyHost != null ) {
			// http.proxyPort is optional; 80 is its documented default.
			int proxyPort = Integer.parseInt(System.getProperty("http.proxyPort", "80"));
			proxy = new HttpHost( proxyHost, proxyPort, "http");
			log.debug("Proxying via "+proxy);
		} else {
			log.debug("No web proxy.");
		}
		// Set up request config:
		RequestConfig requestConfig = RequestConfig.custom()
				.setConnectTimeout(connectionTimeoutSeconds * 1000)
				.setSocketTimeout(responseTimeoutSeconds * 1000).setProxy(proxy).build();
		// Set up the client (redirects are not followed automatically):
		httpClient = HttpClients.custom()
				.setDefaultRequestConfig(requestConfig)
				.disableRedirectHandling()
				.setConnectionManager(cm)
				.build();
		
	}

	/**
	 * Queries every known TimeGate for the given URL, one thread per TimeGate,
	 * and waits for all of them to finish.
	 * <p>
	 * Results are keyed by each Memento's datetime string, so Mementos that
	 * report the same datetime string collapse into a single entry. The order
	 * of the returned list is unspecified.
	 * 
	 * @param url the original resource URL to look up
	 * @return the merged Mementos; empty when no TimeGate is configured or
	 *         none of them returned a result
	 * @throws InterruptedException if the calling thread is interrupted while
	 *         waiting for the worker threads
	 */
	public MementoList lookup(String url) throws InterruptedException {
		long start = System.currentTimeMillis();
		
		// Create a threadsafe holder for results:
		ConcurrentHashMap<String, Memento> ms = new ConcurrentHashMap<String, Memento>();

		// Create a thread for each TimeGate. The array is sized from timeGates
		// (the list that is actually indexed below), not from timeMapPrefixes:
		// the two lists can differ in length.
		GetThread[] threads = new GetThread[timeGates.size()];
		for (int i = 0; i < threads.length; i++) {
			MementoClient httpget = new MementoClient(timeGates.get(i), httpClient);
			threads[i] = new GetThread(ms, httpget, url);
		}

		// start the threads
		for (GetThread thread : threads) {
			thread.start();
		}

		// join the threads
		for (GetThread thread : threads) {
			thread.join();
		}
		long end = System.currentTimeMillis();

		log.info("Overall, took " + (end - start) + " got "+ms.size()+ " mementos.");
		
		MementoList ml = new MementoList();
		for( Memento m : ms.values()) {
			ml.add(m);
		}
		
		return ml;
	}

	/**
	 * Worker that asks a single TimeGate for the Mementos of one URL and
	 * stores them in the shared results map.
	 */
	static class GetThread extends Thread {

		private final MementoClient mc;
		private final String url;
		private final ConcurrentHashMap<String, Memento> ms;

		/**
		 * @param ms shared results map, keyed by Memento datetime string
		 * @param mc client bound to the TimeGate this worker queries
		 * @param url the original resource URL to look up
		 */
		public GetThread(ConcurrentHashMap<String, Memento> ms, MementoClient mc, String url) {
			this.ms = ms;
			this.mc = mc;
			this.url = url;
		}

		@Override
		public void run() {
			long start = System.currentTimeMillis();
			try {
				mc.setTargetURI(url);
				for( Memento item: mc.getMementos()) {
					log.debug("TG+"+mc.getTimegateUri()+":\n\t"+item.getUrl());
					ms.put(item.getDateTimeString(), item);
				}
			} catch (RuntimeException ex) {
				// Logged rather than propagated: a failure here only loses
				// this TimeGate's contribution.
				log.error("ERROR-- " + ex + " for " + url, ex);
			}
			long end = System.currentTimeMillis();
			log.debug("TG " + mc.getTimegateUri()+ " + " + url + " took " + (end - start));
		}
	}

	/**
	 * Downloads the archive list and (re)populates {@link #timeMapPrefixes},
	 * {@link #timeGates} and {@link #icons} from its {@code links/link}
	 * entries.
	 * 
	 * @throws Exception if the document cannot be fetched, parsed or queried
	 */
	private void getTimeMapPrefixes() throws Exception {
		//
		this.timeMapPrefixes = new ArrayList<String>();
		this.timeGates = new ArrayList<String>();
		this.icons = new HashMap<String, String>();
		//
		DocumentBuilderFactory domFactory = DocumentBuilderFactory
				.newInstance();
		domFactory.setNamespaceAware(true);
		// NOTE(review): this parses a document fetched over plain HTTP with
		// default parser settings; consider disabling DTDs/external entities
		// once it is confirmed the archive list does not rely on them.
		DocumentBuilder builder = domFactory.newDocumentBuilder();
		Document doc;
		InputStream in = new URL(ARCHIVE_LIST_URL).openStream();
		try {
			doc = builder.parse(in);
		} finally {
			// Release the connection even when parsing fails.
			in.close();
		}
		XPath xpath = XPathFactory.newInstance().newXPath();
		// XPath Query for iterating through link objects:
		XPathExpression expr = xpath.compile("//links/link");

		// Go through the link entries:
		NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
		for (int i = 0; i < nodes.getLength(); i++) {
			Node n = nodes.item(i);
			// Look for timegate and timemap:
			String timeMap = xpath.evaluate("./timemap[1]/@uri", n);
			if( hasText(timeMap) ) {
				timeMapPrefixes.add(timeMap);
				log.info("Got TM: "+timeMap);
			}
			String timeGate = xpath.evaluate("./timegate[1]/@uri", n);
			if( hasText(timeGate) ) {
				timeGates.add(timeGate);
				log.info("Got TG: "+timeGate);
			}
			// Icon: only registered when the entry also has a TimeGate. With
			// no TimeGate the key would be the empty string, which is a
			// prefix of every memento URL.
			String icon = xpath.evaluate("./icon/@uri", n);
			if( hasText(icon) && hasText(timeGate) ) {
				String uri = timeGate;
				if( uri.endsWith(TIMEGATE_SUFFIX)) {
					// Drop "timegate/" but keep the slash that precedes it.
					uri = uri.substring(0, uri.length() - "timegate/".length());
				}
				this.icons.put(uri,icon);
				log.info("Got icon: "+uri+" > "+icon);
			}
		}
	}

	/**
	 * @param value string to test, may be null
	 * @return true when the value is non-null and not just whitespace
	 */
	private static boolean hasText(String value) {
		return value != null && ! value.trim().isEmpty();
	}
	
	/**
	 * Finds the icon of the archive a Memento came from, by matching the
	 * Memento's URL against the registered archive prefixes.
	 * 
	 * @param m the Memento to find an icon for
	 * @return the icon URI, or null when the Memento (or its URL) is null or
	 *         no registered prefix matches
	 */
	public String getIconUriForMemento( Memento m ) {
		if( m == null || m.getUrl() == null ) {
			return null;
		}
		String mementoUrl = m.getUrl();
		for( Map.Entry<String, String> entry : this.icons.entrySet()) {
			if( mementoUrl.startsWith(entry.getKey())) {
				return entry.getValue();
			}
		}
		return null;
	}

	/**
	 * Looks up http://www.bl.uk and prints how many Mementos were found.
	 * 
	 * @param args ignored
	 * @throws Exception if the lookup is interrupted
	 */
	public static void main(String[] args) throws Exception {

		MementosAggregator me = new MementosAggregator();
		MementoList ms = me.lookup("http://www.bl.uk");
		System.out.println("Got "+ms.size()+" mementos.");
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy