All downloads are free. The search and download functionality uses the official Maven repository.

org.scijava.util.MirrorWebsite Maven / Gradle / Ivy

/*
 * #%L
 * SciJava Common shared library for SciJava software.
 * %%
 * Copyright (C) 2009 - 2017 Board of Regents of the University of
 * Wisconsin-Madison, Broad Institute of MIT and Harvard, Max Planck
 * Institute of Molecular Cell Biology and Genetics, University of
 * Konstanz, and KNIME GmbH.
 * %%
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 * #L%
 */

package org.scijava.util;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * This program mirrors a given website.
 * 

* Its primary purpose is to provide the code necessary to keep ImageJ Mirror up-to-date. *

* * @author Johannes Schindelin */ public class MirrorWebsite { public final static int THREAD_COUNT = 20; public final static long DELAY_IN_MICROSECONDS = 0; private String baseURL; private String basePath; // the local directory for file:// baseURL, otherwise null private File localDirectory; private Map linkMap = new HashMap<>(); private Set missingLinks = new LinkedHashSet<>(); private ExecutorService executorService; private Map jobs; private Set done; private int threadCount; private long delay; public MirrorWebsite(final String baseURL, final File localDirectory, final int threadCount, final long delay) { this.baseURL = baseURL + (baseURL.endsWith("/") ? "" : "/"); this.basePath = baseURL.startsWith("file:") ? baseURL.substring(5) : null; this.localDirectory = localDirectory; this.threadCount = threadCount; this.delay = delay; } public void run() throws InterruptedException { synchronized (this) { if (jobs != null) throw new RuntimeException("Mirroring already in progress!"); executorService = Executors.newFixedThreadPool(threadCount); done = new TreeSet<>(); jobs = new LinkedHashMap<>(); mirror("index.html"); } executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS); } public void mirror(String path) { final MirrorJob job; synchronized (this) { if (jobs.containsKey(path)) return; job = new MirrorJob(path); jobs.put(path, job); } try { executorService.execute(job); } catch (Throwable t) { t.printStackTrace(); done.add(path); } } private static long getRemoteTimestamp(String url) throws IOException { URLConnection connection = null; try { connection = new URL(url).openConnection(); } catch (FileNotFoundException e) { if (url.endsWith("/index.html")) connection = new URL(url.substring(0, url.length() - 10)).openConnection(); else throw e; } if (connection instanceof HttpURLConnection) ((HttpURLConnection)connection).setRequestMethod("HEAD"); connection.setUseCaches(false); long lastModified = connection.getLastModified(); 
connection.getInputStream().close(); return lastModified; } // returns 0 if it is up-to-date, otherwise the desired lastModified private long upToDate(String path) throws IOException { long remote = getRemoteTimestamp(baseURL + path); File file = new File(localDirectory, path); if (!file.exists()) return remote; long local = file.lastModified(); return remote < 0 || local == remote ? 0 : remote; } private String getValue(String html, int startOffset) { int offset = startOffset; while (offset < html.length() && (html.charAt(offset) == '\n' || html.charAt(offset) == '\r' || html.charAt(offset) == ' ')) offset++; if (offset + 1 >= html.length()) return ""; char delim = ' ', delim2 = '>'; char c = html.charAt(offset); if (c == '"' || c == '\'') { delim = delim2 = c; offset++; } for (int end = offset; end < html.length(); end++) if (html.charAt(end) == delim || html.charAt(end) == delim2) return html.substring(offset, end); return html.substring(offset); } private void addLinkRelation(List result, String sourceURL, String url) { String normalized = normalizeURL(url); if (normalized == null) return; result.add(normalized); synchronized(linkMap) { String previous = linkMap.get(normalized); if (previous == null) linkMap.put(normalized, sourceURL); else if ((" " + previous + " ").indexOf(" " + sourceURL + " ") < 0) linkMap.put(normalized, previous + " " + sourceURL); } } private List getLinks(String relativePath, String path, String html) { List result = new ArrayList<>(); int offset = -1; for (;;) { int newOffset = -1; for (String pattern : new String[] { " href=", " src=", " HREF=", " SRC=" }) { int tmp = html.indexOf(pattern, offset + 1); if (tmp >= 0 && (newOffset < 0 || newOffset > tmp)) newOffset = tmp + pattern.length(); } if (newOffset < 0) break; offset = newOffset; String value = getValue(html, offset); offset += value.length(); if (value.startsWith("mailto:") || value.startsWith("MAILTO:")) continue; for (char c : new char[] { '#', '?', ';' }) { int hash = 
value.indexOf(c); if (hash >= 0) value = value.substring(0, hash); } if (value.endsWith("/")) value += "index.html"; if (value.startsWith("/")) { int colon = baseURL.indexOf("://"); int slash = baseURL.indexOf('/', colon + 3); value = baseURL.substring(0, slash) + value; } else if (value.indexOf("://") < 0) { if (!value.equals("")) addLinkRelation(result, path, relativePath + value); if (offset < 0) break; continue; } if (value.startsWith(baseURL)) addLinkRelation(result, path, value.substring(baseURL.length())); if (offset < 0) break; } return result; } private static boolean isHTML(String path) { String lower = path.toLowerCase(); return lower.endsWith(".htm") || lower.endsWith(".html"); } private static void copyStream(InputStream in, StringBuffer string, OutputStream out) throws IOException { byte[] buffer = new byte[65536]; for (;;) { int count = in.read(buffer); if (count < 0) break; if (string != null) string.append(new String(buffer, 0, count)); if (out != null) out.write(buffer, 0, count); } in.close(); if (out != null) out.close(); } private List ensureUptodate(String path) throws IOException { StringBuffer string = new StringBuffer(); String relativePath = path.substring(0, path.lastIndexOf('/') + 1); File file = new File(localDirectory, path); // special-case local case: file:/.../ does not list the directory contents if (basePath != null && ("/" + path).endsWith("/index.html") && !new File(basePath + path).exists()) { final String directory = path.substring(0, path.length() - 10); final File[] list = new File(basePath + directory).listFiles(); if (list == null) return Collections.emptyList(); final List result = new ArrayList<>(); for (final File item : list) { if (item.isDirectory()) result.add(directory + item.getName() + "/index.html"); else result.add(directory + item.getName()); } return result; } long remoteLastModified; try { remoteLastModified = upToDate(path); if (remoteLastModified == 0) { if (!isHTML(path)) return Collections.emptyList(); 
copyStream(new FileInputStream(file), string, null); return getLinks(relativePath, path, string.toString()); } } catch (FileNotFoundException e) { if (!path.endsWith("/index.html")) throw e; remoteLastModified = -1; } InputStream in = null; try { in = new URL(baseURL + path).openStream(); } catch (MalformedURLException e) { throw new MalformedURLException(baseURL + path); } catch (FileNotFoundException e) { if (path.endsWith("/index.html")) in = new URL(baseURL + path.substring(0, path.length() - 10)).openStream(); else throw e; } System.err.println("Downloading " + path); File tmp = new File(localDirectory, path + ".download.tmp"); tmp.getParentFile().mkdirs(); FileOutputStream out = new FileOutputStream(tmp); if (isHTML(path)) { copyStream(in, string, null); String rewritten = string.toString() .replaceAll("http://rsb.info.nih.gov", "http://imagej.nih.gov"); String replacement = "", path2 = path; for (;;) { path2 = path2.substring(0, path2.lastIndexOf('/') + 1); rewritten = rewritten.replaceAll(baseURL + path2, replacement); // special-case rewriting from a local mirror if (basePath != null) { rewritten = rewritten.replaceAll("http://imagej.nih.gov/ij/" + path2, replacement); } if (path2.equals("")) break; // strip trailing slash path2 = path2.substring(0, path2.length() - 1); replacement = "../" + replacement; } copyStream(new ByteArrayInputStream(rewritten.getBytes()), null, out); } else copyStream(in, null, out); tmp.renameTo(file); if (remoteLastModified >= 0) file.setLastModified(remoteLastModified); if (!isHTML(path)) return Collections.emptyList(); return getLinks(relativePath, path, string.toString()); } private static String normalizeURL(String originalPath) { String path = originalPath; for (;;) { int dot = path.indexOf("/./"); if (dot >= 0) { path = path.substring(0, dot) + path.substring(dot + 2); continue; } int dotdot = path.indexOf("/../"); if (dotdot < 0) break; if (dotdot == 0) return null; int slash = path.lastIndexOf(dotdot - 1); if (slash < 0) 
return null; path = path.substring(0, slash) + path.substring(dotdot + 3); } if (path.startsWith("../")) throw new RuntimeException("ignore"); return path; } private void reportMissingLinks() { if (missingLinks.size() == 0) return; System.err.println("Found broken links:"); for (final String path : missingLinks) { final String source = linkMap.get(path); System.err.println(path + (source == null ? "" : " (linked from " + source + ")")); } } private class MirrorJob implements Runnable { private String path; public MirrorJob(String path) { this.path = path; } @Override public void run() { try { System.err.println("Looking at " + path + " (" + (1 + done.size()) + "/" + jobs.size() + ")"); for (String path2 : ensureUptodate(path)) try { mirror(path2); } catch (Throwable e) { System.err.println("" + e); } } catch (FileNotFoundException e) { String source = linkMap.get(path); System.err.println("" + e + (source == null ? "" : " (linked from " + source + ")")); missingLinks.add(path); } catch (Throwable e) { System.err.println("Error while trying to mirror " + path); e.printStackTrace(); } if (delay > 0) try { Thread.sleep(delay); } catch (InterruptedException e) { // ignore } synchronized (MirrorWebsite.this) { done.add(path); if (done.size() == jobs.size()) { executorService.shutdown(); reportMissingLinks(); } } } } private static void usage() { System.err.println("Usage: MirrorWebsite [




© 2015 - 2025 Weber Informatics LLC | Privacy Policy