// dev.memento.MementoClient Maven / Gradle / Ivy
// The newest version!
/**
* MementoBrowser.java
*
* Copyright 2010 Frank McCown
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
* This is the Memento Browser activity which houses a customized web browser for
* performing http queries using Memento.
*
* Learn more about Memento:
* http://mementoweb.org/
*/
package dev.memento;
/*
* #%L
* mementoweb-java-client
* %%
* Copyright (C) 2012 - 2013 The British Library
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import java.io.IOException;
import java.net.URISyntaxException;
import java.text.DateFormat;
import java.util.Calendar;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import org.apache.http.Header;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
public class MementoClient {
/** Logger shared by all instances of this class. */
static Logger log = Logger.getLogger(MementoClient.class.getCanonicalName());

// Dialog identifiers. They are not referenced anywhere in this class but are
// package-visible, so other classes in the package may rely on them.
static final int DIALOG_DATE = 0;
static final int DIALOG_ERROR = 1;
static final int DIALOG_MEMENTO_DATES = 2;
static final int DIALOG_MEMENTO_YEARS = 3;
static final int DIALOG_HELP = 4;

/** Known TimeGate URI prefixes; the first entry is used as the default. */
private String[] mTimegateUris = { "http://timetravel.mementoweb.org/timegate/" };

/** Client used for every HTTP request; either built by setupHttpClient() or supplied by the caller. */
private HttpClient httpClient;

// TimeGate URI prefix that the target URL is appended to. Defaults to the
// first known TimeGate and can be replaced via setTimegateUri().
private String mDefaultTimegateUri = mTimegateUris[0];

/** Datetime sent as Accept-Datetime when contacting the TimeGate. */
private SimpleDateTime mDateChosen = new SimpleDateTime();

/** Set when a rel="timebundle" link is parsed; null otherwise. */
private TimeBundle mTimeBundle;

/**
 * TimeMaps discovered for the page being queried. The element type must be
 * declared: the for-each loops over this set iterate it as TimeMap, which
 * does not compile against a raw HashSet.
 */
private HashSet<TimeMap> mTimeMaps;

/** First and last Mementos found by the most recent parseCsvLinks() call. */
private Memento mFirstMemento;
private Memento mLastMemento;

/** Mementos collected from the downloaded TimeMaps. */
private MementoList mMementos;

// Not referenced in this class.
private final int MAX_NUM_MEMENTOS_IN_LIST = 20;

/** Human-readable description of the last failure, or null when the last query had no error. */
private CharSequence mErrorMessage;

// Used when selecting a memento (not referenced in this class)
int mSelectedYear = 0;

// Used in http requests; when left null no User-Agent value is configured by this class
public String mUserAgent;

/** Generic message used when a failure has no more specific explanation. */
private String mDefaultErrorMessage = "Sorry, but there was an unexpected error that will "
        + "prevent the Memento from being displayed. Try again in 5 minutes.";
/**
 * Shared initialisation for the public constructors.
 *
 * Sets the date and time formats on SimpleDateTime (these are static fields,
 * so every new client overwrites them for the whole process) and creates the
 * empty TimeMap and Memento collections. It does NOT create an HTTP client;
 * callers must either invoke setupHttpClient() or assign one.
 */
private MementoClient() {
    // Set the date and time format
    SimpleDateTime.mDateFormat = DateFormat.getDateInstance(DateFormat.DEFAULT, Locale.US);
    SimpleDateTime.mTimeFormat = DateFormat.getTimeInstance(DateFormat.DEFAULT, Locale.US);
    // Holds all the timemaps for the web page being viewed
    mTimeMaps = new HashSet<TimeMap>();
    mMementos = new MementoList();
}
/**
 * Creates a client that talks to the given TimeGate and builds its own
 * HTTP client via setupHttpClient().
 *
 * @param timegate TimeGate URI prefix; the target URL is appended to it as-is
 */
public MementoClient(String timegate) {
    this();
    setTimegateUri(timegate);
    setupHttpClient();
}
/**
 * Creates a client that talks to the given TimeGate using a caller-supplied
 * HTTP client.
 *
 * @param timegate   TimeGate URI prefix; the target URL is appended to it as-is
 * @param httpClient client to use for all requests; when null, a default
 *                   client is built instead so that requests do not fail
 *                   later on a missing client
 */
public MementoClient(String timegate, HttpClient httpClient) {
    this();
    this.setTimegateUri(timegate);
    this.httpClient = httpClient;
    // No-op when a client was supplied; otherwise builds the default one.
    setupHttpClient();
}
/**
 * Helper to create a web-proxy-aware HttpClient.
 *
 * Does nothing when a client is already set. Otherwise builds one with
 * automatic redirect handling disabled, routed through the proxy named by the
 * http.proxyHost / http.proxyPort system properties when a host is configured.
 */
private void setupHttpClient() {
    if (httpClient != null) {
        log.debug("Using existing httpClient...");
        return;
    }
    HttpHost proxy = null;
    String proxyHost = System.getProperty("http.proxyHost");
    if (proxyHost != null && proxyHost.trim().length() > 0) {
        // Integer.getInteger returns the fallback when the property is missing
        // or not a number, instead of throwing like Integer.parseInt(null).
        int proxyPort = Integer.getInteger("http.proxyPort", 80).intValue();
        proxy = new HttpHost(proxyHost.trim(), proxyPort, "http");
        log.debug("Proxying via "+proxy);
    } else {
        log.debug("No web proxy.");
    }
    // Disable automatic redirect handling so we can process the 302 ourself
    httpClient = HttpClientBuilder.create()
            .disableRedirectHandling()
            .setProxy(proxy)
            .build();
}
/**
 * Queries the TimeGate for a Memento of the given URL and, when the response
 * advertises TimeMaps, downloads them as well.
 *
 * The request runs synchronously on the calling thread. On failure
 * mErrorMessage is set to a user-readable description; on success it is left
 * untouched. A 300 response is currently ignored (no state change, no error).
 *
 * @param initUrl The URL whose Memento is to be discovered
 */
private void makeHttpRequests(String initUrl) {
    // Contact Memento proxy with chosen Accept-Datetime:
    // http://mementoproxy.lanl.gov/aggr/timegate/http://example.com/
    // Accept-Datetime: Tue, 24 Jul 2001 15:45:04 GMT
    String url = mDefaultTimegateUri + initUrl;
    HttpGet httpget = new HttpGet(url);

    // Change the request date to the last hour of the day if this is the
    // first memento. Otherwise we'll be out of range.
    String acceptDatetime;
    if (mFirstMemento != null && mFirstMemento.getDateTime().equals(mDateChosen)) {
        log.debug("Changing chosen time to 23:59 since datetime matches first Memento.");
        SimpleDateTime dt = new SimpleDateTime(mDateChosen);
        dt.setToLastHour();
        acceptDatetime = dt.longDateFormatted();
    } else {
        acceptDatetime = mDateChosen.longDateFormatted();
    }
    httpget.setHeader("Accept-Datetime", acceptDatetime);
    // Only override the User-Agent when one was configured; a null value would
    // otherwise be installed as a header with no usable value.
    if (mUserAgent != null) {
        httpget.setHeader("User-Agent", mUserAgent);
    }
    log.debug("Accessing: " + httpget.getURI());
    log.debug("Accept-Datetime: " + acceptDatetime);
    log.debug("HC mHR Requesting...");

    HttpResponse response = null;
    try {
        response = httpClient.execute(httpget);
        log.debug("Response code = " + response.getStatusLine());
    } catch (Exception e) {
        mErrorMessage = "Sorry, we are having problems contacting the server. Please " +
                "try again later.";
        log.error("Exception when performing query to "+this.getTimegateUri(), e);
        return;
    }
    log.debug("HC mHR Responded.");

    // Only the status line and headers of the TimeGate response are used, so
    // discard the body right away. This hands the connection back before any
    // TimeMap request is made; status and headers stay readable afterwards.
    EntityUtils.consumeQuietly(response.getEntity());

    // Get back:
    // 300 (TCN: list with multiple Mementos to choose from)
    // or 302 (TCN: choice)
    // or 404 (no Mementos for this URL)
    // or 406 (TCN: list with only first and last Mementos)
    int statusCode = response.getStatusLine().getStatusCode();
    if (statusCode == 300) {
        // TODO: Implement. Right now the lanl proxy doesn't appear to be returning this
        // code, so let's just ignore it for now.
    } else if (statusCode == 301) {
        mErrorMessage = mDefaultErrorMessage;
        // getFirstHeader yields null (logged as "null") rather than failing
        // when the server omitted the Location header.
        log.info("Got 301 pointing to: " + response.getFirstHeader("Location"));
        log.error("Status code 301 not supported!");
    } else if (statusCode == 302) {
        handleTimegateRedirect(response);
    } else if (statusCode == 404) {
        mErrorMessage = "Sorry, there are no Mementos for this web page.";
    } else if (statusCode == 406) {
        handleTimegateNotAcceptable(response, url);
    } else {
        mErrorMessage = mDefaultErrorMessage;
        log.error("Unexpected response code in makeHttpRequests = " + statusCode);
    }
}

/**
 * Handles a 302 from the TimeGate: checks that a Location header is present,
 * parses the first Link header and downloads any TimeMaps it advertises.
 * Only the first Link header is examined.
 */
private void handleTimegateRedirect(HttpResponse response) {
    // Note that the date/time of this memento is not given in the Location but can
    // be found when parsing the Link header.
    Header[] headers = response.getHeaders("Location");
    if (headers.length == 0) {
        mErrorMessage = mDefaultErrorMessage;
        log.error("Error: Location header not found in response headers.");
        return;
    }
    log.debug("Memento located at: " + headers[0].getValue());

    // Parse various Links
    headers = response.getHeaders("Link");
    if (headers.length == 0) {
        log.error("Error: Link header not found in response headers.");
        mErrorMessage = "Sorry, but the Memento could not be accessed. Try again in 5 minutes.";
        return;
    }
    String linkValue = headers[0].getValue();
    resetDiscoveredState();
    // Do not add the mementos to the global list of mementos because
    // the global list will be created when we process the timemap later.
    parseCsvLinks(linkValue, false);
    if (mTimeMaps.size() > 0 && !accessTimeMap() && mErrorMessage == null) {
        mErrorMessage = "There were problems accessing the Memento's TimeMap. " +
                "Please try again later.";
    }
}

/**
 * Handles a 406 from the TimeGate: parses the first Link header, downloads any
 * TimeMaps and reports an error when no first/last Memento could be found.
 */
private void handleTimegateNotAcceptable(HttpResponse response, String url) {
    // Parse various Links
    Header[] headers = response.getHeaders("Link");
    if (headers.length == 0) {
        log.debug("Error: Link header not found in 406 response headers.");
        // The lanl proxy has it wrong. It should return 404 when the URL is not
        // present, so we'll just pretend this is a 404.
        mErrorMessage = "Sorry, but there are no Mementos for this URL.";
        return;
    }
    String linkValue = headers[0].getValue();
    resetDiscoveredState();
    parseCsvLinks(linkValue, false);
    if (mTimeMaps.size() > 0) {
        // The outcome is deliberately not checked here, as before; a 404 on
        // the TimeMap still sets mErrorMessage inside accessTimeMap().
        accessTimeMap();
    }
    if (mFirstMemento == null || mLastMemento == null) {
        log.error("Could not find first or last Memento in 406 response for " + url);
        mErrorMessage = "Sorry, but there was an error in retreiving this Memento.";
    } else {
        log.debug("Not available in this date range (" + mFirstMemento.getDateTimeSimple() +
                " to " + mLastMemento.getDateTimeSimple() + ")");
        // According to Rob Sanderson (LANL), we will only get 406 when the date is too
        // early, so redirect to first Memento
        // FIXME ?
    }
}

/** Forgets the TimeMaps, TimeBundle and Mementos gathered for a previous query. */
private void resetDiscoveredState() {
    mTimeMaps.clear();
    mTimeBundle = null;
    mMementos.clear();
}
/**
 * Checks whether the TimeMap referenced by a link has been recorded before,
 * comparing by URL against every TimeMap collected so far.
 *
 * @param link link whose URL is looked up
 * @return true if a collected TimeMap has the same URL, false otherwise
 */
private boolean timeMapAlreadyExists(Link link) {
    boolean duplicate = false;
    for (TimeMap known : mTimeMaps) {
        if (!known.getUrl().equals(link.getUrl())) {
            continue;
        }
        log.debug("Link contains a duplicate timemap URL that is being " +
                "ignored: " + link.toString());
        duplicate = true;
        break;
    }
    return duplicate;
}
/**
 * Parses a comma-separated list of links (a Link header value or a TimeMap
 * body) and records what it finds:
 * <ul>
 * <li>links whose rel contains "memento" become Mementos, optionally added to
 *     the global list; rel values containing "first"/"last" set the first and
 *     last Memento;</li>
 * <li>rel="timemap" links of type application/link-format that have not been
 *     seen yet are added to the set of TimeMaps to download;</li>
 * <li>a rel="timebundle" link is stored as the TimeBundle.</li>
 * </ul>
 *
 * Example data (URIs are placeholders):
 * <pre>
 * &lt;http://example.org/timebundle/http://example.com/&gt;;rel="timebundle",
 * &lt;http://example.com/&gt;;rel="original",
 * &lt;http://example.org/20010724/http://example.com/&gt;;rel="first memento";datetime="Tue, 24 Jul 2001 15:45:04 GMT",
 * &lt;http://example.org/20010910/http://example.com/&gt;;rel="memento";datetime="Mon, 10 Sep 2001 20:33:50 GMT",
 * </pre>
 *
 * Another example:
 * <pre>
 * &lt;http://example.org/timebundle/http://example.com/&gt;;rel="timebundle",
 * &lt;http://example.com/&gt;;rel="original",
 * &lt;http://example.org/timemap/http://example.com/&gt;;rel="timemap";type="application/link-format",
 * &lt;http://example.org/20100907/http://example.com/&gt;;rel="first last memento";datetime="Tue, 07 Sep 2010 11:54:29 GMT"
 * </pre>
 *
 * @param links            the link list to parse; null is treated as empty
 * @param addToMementoList whether parsed Mementos are appended to the global list
 * @return the first Memento encountered in the links, or null if there was none
 */
public Memento parseCsvLinks(String links, boolean addToMementoList) {
    mFirstMemento = null;
    mLastMemento = null;
    Memento returnMemento = null;

    // Splitting on the quote-comma boundary removes each entry's closing quote,
    // which is restored below.
    String[] linkStrings = (links == null) ? new String[0] : links.split("\"\\s*,");
    log.debug("Start parsing " + linkStrings.length + " links");
    int mementoLinks = 0;

    for (String rawLink : linkStrings) {
        // Trim BEFORE testing for the closing quote: the last entry keeps its
        // quote but is often followed by whitespace or a newline, and testing
        // first would append a second, stray quote.
        String linkStr = rawLink.trim();
        if (linkStr.length() == 0) {
            // Left over from a trailing comma or an empty input
            continue;
        }
        // Add back "
        if (!linkStr.endsWith("\"")) {
            linkStr += "\"";
        }

        Link link = new Link(linkStr);
        String rel = link.getRel();
        if (rel == null) {
            log.debug("Skipping link without rel: " + linkStr);
            continue;
        }

        if (rel.contains("memento")) {
            mementoLinks++;
            Memento m = new Memento(link);
            // There may be just one memento in the links, so it should be returned
            if (returnMemento == null) {
                returnMemento = m;
            }
            if (addToMementoList) {
                mMementos.add(m);
            }
            // Peel out all values in rel which are separated by white space
            String[] items = link.getRelArray();
            for (String r : items) {
                // Fixed locale so the comparison does not depend on the
                // platform default (e.g. Turkish dotless i).
                r = r.toLowerCase(Locale.US);
                // First and last should be reported in 302 response
                if (r.contains("first")) {
                    mFirstMemento = m;
                }
                if (r.contains("last")) {
                    mLastMemento = m;
                }
            }
        } else if (rel.equals("timemap")) {
            // See if this is really a new timemap (server could be mistaken, and
            // we don't want to be caught in an infinite loop
            TimeMap tm = new TimeMap(link);
            if ("application/link-format".equalsIgnoreCase(tm.getType())) {
                if (!timeMapAlreadyExists(link)) {
                    log.debug("Adding new timemap " + link.toString());
                    mTimeMaps.add(tm);
                }
            } else {
                log.debug("Skipping timemap in unsupported format "+tm.getType());
            }
        } else if (rel.equals("timebundle")) {
            mTimeBundle = new TimeBundle(link);
        }
    }

    // Sorting can take a long time. If there are just a few (like from a TimeGate),
    // go ahead and sort since they are not usually listed in order. But a large
    // listing from a TimeMap is already sorted by the LANL proxy.
    if (addToMementoList && mMementos.size() < 5) {
        log.debug("Sorting short Memento list...");
        Collections.sort(mMementos);
    }
    log.debug("Finished parsing, found " + mementoLinks + " Memento links");
    log.debug("Total mementos: " + mMementos.size());

    // If these aren't set then this is likely a timemap. Only consult the
    // global list when it has entries, so an empty list simply leaves the
    // first/last Memento unset.
    if (mMementos.size() > 0) {
        if (mFirstMemento == null) {
            mFirstMemento = mMementos.getFirst();
        }
        if (mLastMemento == null) {
            mLastMemento = mMementos.getLast();
        }
    }
    return returnMemento;
}
/**
 * Looks through the collected TimeMaps for one that is not yet marked as
 * downloaded.
 *
 * @return a TimeMap still to be fetched, or null when every collected TimeMap
 *         is already marked as downloaded
 */
private TimeMap getTimemapToDownload() {
    TimeMap pending = null;
    for (TimeMap candidate : mTimeMaps) {
        if (candidate.isDownloaded()) {
            continue;
        }
        pending = candidate;
        break;
    }
    return pending;
}
/**
 * Retrieve every not-yet-downloaded TimeMap from the Web and parse out the
 * Mementos. Only link-format TimeMaps (and the legacy text/csv type) are
 * understood. Paged TimeMaps are supported: parsing one TimeMap may add
 * further TimeMaps, which are then fetched by the same loop.
 *
 * Processing stops at the first failure. A 404 additionally sets
 * mErrorMessage; other failures leave it to the caller.
 *
 * @return true if all TimeMaps were successfully retrieved, false otherwise.
 */
private boolean accessTimeMap() {
    TimeMap tm = getTimemapToDownload();
    // Access every timemap that has been discovered
    while (tm != null) {
        // Mark first so a failing TimeMap is never retried by this loop
        tm.setDownloaded(true);
        String url = tm.getUrl();
        HttpGet httpget = new HttpGet(url);
        // Only override the User-Agent when one was configured
        if (mUserAgent != null) {
            httpget.setHeader("User-Agent", mUserAgent);
        }
        log.debug("Accessing TimeMap: " + httpget.getURI());
        log.debug("HC TM Requesting...");

        HttpResponse response = null;
        try {
            response = httpClient.execute(httpget);
            log.debug("Response code = " + response.getStatusLine());
        } catch (Exception e) {
            log.error(Utilities.getExceptionStackTraceAsString(e));
            return false;
        }
        log.debug("HC TM Responded.");

        try {
            // Should get back 200 unless something is really wrong
            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode == 200) {
                String tmType = tm.getType();
                // See if MIME type is the same as Type
                Header type = response.getFirstHeader("Content-Type");
                if (type == null) {
                    log.warn("Could not find the Content-Type for " + url);
                } else if (tmType == null || type.getValue() == null
                        || !type.getValue().contains(tmType)) {
                    log.warn("Content-Type is [" + type.getValue() + "] but TimeMap type is [" +
                            tmType + "] for " + url);
                }
                // Timemap MUST be "application/link-format", but leave csv for
                // backwards-compatibility with earlier Memento implementations
                if ("text/csv".equals(tmType) || "application/link-format".equals(tmType)) {
                    try {
                        // UTF-8 is only the fallback; a charset declared in the
                        // response's Content-Type still takes precedence.
                        String responseBody = EntityUtils.toString(response.getEntity(), "UTF-8");
                        parseCsvLinks(responseBody, true);
                    } catch (Exception ex) {
                        log.error("Failed to read or parse TimeMap " + url, ex);
                        return false;
                    }
                } else {
                    log.error("Unable to handle TimeMap type " + tmType);
                    return false;
                }
            } else if (statusCode == 404) {
                log.debug("404 response means no mementos");
                mErrorMessage = "Sorry, there are no Mementos for this web page.";
                return false;
            } else {
                log.debug("Unexpected response code in accessTimeMap = " + statusCode);
                return false;
            }
        } finally {
            // Release the connection on every path, including the early
            // returns above where the body was never read.
            EntityUtils.consumeQuietly(response.getEntity());
        }
        tm = getTimemapToDownload();
    }
    return true;
}
//@Deprecated
/**
 * Runs a Memento lookup for the given URL. Any error message from a previous
 * lookup is cleared first; results are afterwards available through
 * getMementos() and getErrorMessage().
 *
 * @param target URL to look up. It is first passed through
 *               Utilities.getUrlFromArchiveUrl (presumably to recover the
 *               original URL when an archive URL is given -- confirm against
 *               that helper).
 */
public void setTargetURI( String target ) {
    log.debug("Looking for "+target);
    // Just in case an archive URL was being viewed
    final String lookupUrl = Utilities.getUrlFromArchiveUrl(target);
    // Start the requests...
    mErrorMessage = null;
    makeHttpRequests(lookupUrl);
}
//@Deprecated
/**
 * @return the live list of Mementos gathered by the most recent lookup
 *         (the internal list itself, not a copy)
 */
public MementoList getMementos() {
    return mMementos;
}
/**
 * Looks up the given URL and returns the Mementos that were found.
 * The list is returned even when the lookup failed, so check
 * getErrorMessage() afterwards.
 *
 * @param uri URL to look up
 * @return the Memento list after the lookup has run
 */
public MementoList getMementos(String uri) {
    setTargetURI(uri);
    return getMementos();
}
/**
 * @return the error message of the last lookup as a String, or null if all is well.
 */
public String getErrorMessage() {
    return (mErrorMessage != null) ? mErrorMessage.toString() : null;
}
/**
 * @return the TimeGate URI prefix currently used for lookups
 */
public String getTimegateUri() {
    return this.mDefaultTimegateUri;
}
/**
 * Replaces the TimeGate used for subsequent lookups.
 *
 * @param mTimegateUri the TimeGate URI prefix to use; the target URL is
 *                     appended to it without any separator being added
 */
public void setTimegateUri(String mTimegateUri) {
    mDefaultTimegateUri = mTimegateUri;
}
/**
 * Releases the HTTP client's resources. Safe to call when no client was ever
 * created. Note that a client supplied by the caller is shut down as well.
 */
public void finalise() {
    if (httpClient == null) {
        // Nothing was allocated
        return;
    }
    if (httpClient instanceof CloseableHttpClient) {
        try {
            ((CloseableHttpClient) httpClient).close();
        } catch (IOException e) {
            log.warn("Problem while closing the HTTP client", e);
        }
    } else {
        // Fallback for clients that can only be shut down via their connection manager
        httpClient.getConnectionManager().shutdown();
    }
}
/**
 * Command-line utility to take a URL and look up who holds archived copies (Mementos).
 * Prints either the error message of the lookup or the number of Mementos listed.
 *
 * @param args optional; args[0] is the URL to look up (defaults to http://www.bl.uk)
 * @throws URISyntaxException declared for backward compatibility; not thrown by this method itself
 */
public static void main( String[] args ) throws URISyntaxException {
    String query = "http://www.bl.uk";
    if (args.length > 0) {
        query = args[0];
    }
    System.out.println("Looking for: "+query);

    // Query:
    MementoClient mc = new MementoClient();
    // The no-argument constructor does not create an HTTP client, so build one
    // here; without it every request fails before reaching the network.
    mc.setupHttpClient();
    try {
        long start = System.currentTimeMillis();
        log.debug("Launch: "+Calendar.getInstance());
        //mc.setTimegateUri("http://www.webarchive.org.uk/wayback/memento/timegate/");
        mc.setTargetURI(query);
        log.debug("Qdone: "+Calendar.getInstance());
        long end = System.currentTimeMillis();

        // Get results:
        String error = mc.getErrorMessage();
        if (error != null) {
            System.err.println(error);
        } else {
            System.out.println("Mementos listed: " + mc.getMementos().size());
        }
        log.debug("Duration: "+(end-start)/1000.0);
    } finally {
        // Always release the HTTP client's resources
        mc.finalise();
    }
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy