All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.format.gzip.zipnum.ZipNumCluster Maven / Gradle / Ivy

The newest version!
package org.archive.format.gzip.zipnum;
/**
 * ZipNumCluster
 * 
 * A ZipNumIndex representing multiple shards which can be loaded dynamically. The shard locations are loaded dynamically
 * from a specified file and can be reloaded at a specified interval.
 *   Files used
 *   - ALL.loc - a required file specifying, one shard per line, {@code partId\tlocation[\tlocation...]}
 *   - ALL.lastblocks - a file specifying size of last blocks in each shard. This is optional and only used for size calculation.
 * 
 */

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.util.ArchiveUtils;
import org.archive.util.GeneralURIStreamFactory;
import org.archive.util.binsearch.SeekableLineReader;
import org.archive.util.binsearch.SeekableLineReaderFactory;
import org.archive.util.binsearch.SeekableLineReaderIterator;
import org.archive.util.binsearch.impl.HTTPSeekableLineReader;
import org.archive.util.iterator.CloseableIterator;

public class ZipNumCluster extends ZipNumIndex {
	
	// Class-wide java.util.logging logger, named after this class.
	final static Logger LOGGER = Logger.getLogger(ZipNumCluster.class.getName());
	
	/**
	 * Shared iterator that never yields a line. The getCDXIterator overloads
	 * return it when dateRangeCheck rejects the requested key.
	 * It has no state, so one instance is handed to every caller.
	 */
	protected final static CloseableIterator<String> EMPTY_ITERATOR = new CloseableIterator<String>()
	{
		/** Always false: there is never a next element. */
		@Override
		public boolean hasNext() {
			return false;
		}

		/** Returns null rather than throwing; callers are expected to check hasNext() first. */
		@Override
		public String next() {
			return null;
		}

		/** No-op: there is nothing to remove. */
		@Override
		public void remove() {
			
		}

		/** No-op: there is no underlying resource to release. */
		@Override
		public void close() throws IOException {
			
		}
	};
	
	/**
	 * Polling task run by the updater thread started in init().
	 * Each pass compares the location file's current modification time with
	 * lastModTime and, when they differ, reloads the locations through
	 * syncLoad and then asks the summary (if any) to reload its factory.
	 * The loop ends when the thread is interrupted.
	 */
	private class LocationUpdater implements Runnable
	{
		@Override
		public void run() {
			try {
				while (true) {
					long currModTime = locReaderFactory.getModTime();
					
					if (currModTime != lastModTime) {
						syncLoad(currModTime);
						
						// One extra interval passes between the location reload and the summary reload
						Thread.sleep(checkInterval);
						
						if (summary != null) {
							summary.reloadFactory();
						}
					}
					
					Thread.sleep(checkInterval);
				}
			} catch (InterruptedException ie) {
				// Catching the exception clears the interrupt flag; restore it so whoever
				// owns this thread can still see that it was interrupted, then let run() end.
				Thread.currentThread().interrupt();
			}
		}
	}
	
	// Part id -> candidate locations for that part, as read from locFile
	// (keys and values are the String / String[] pairs stored by loadPartLocations)
	protected HashMap<String, String[]> locMap = null;
	// Factory for readers over locFile; also the source of its modification time
	protected SeekableLineReaderFactory locReaderFactory = null;
	// Path or URI of the location file
	protected String locFile;
	
	// Modification time of locFile recorded at the last load
	protected long lastModTime = 0;
	
	// Milliseconds the updater thread sleeps between polls; init() starts no updater thread when this is <= 0
	protected int checkInterval = 30000;
	
	// Background thread running LocationUpdater; stays null when polling is off
	protected Thread updaterThread;
	
	
	// Special first-column keys recognized in the location file
	public final static String EARLIEST_TIMESTAMP = "_EARLIEST";
	public final static String LATEST_TIMESTAMP = "_LATEST";	
	public final static String OFF = "OFF";
	
	// Format of the _EARLIEST / _LATEST values. SimpleDateFormat is not thread-safe,
	// so this instance must not be used from several threads at once.
	protected SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);
	// Bounds applied by dateRangeCheck; null means unbounded on that side
	protected Date startDate, endDate;
	
	// One row of the last-blocks file, as parsed by loadLastBlockSizes
	class BlockSize
	{
		// Third tab-separated field of the row; getLastBlockDiff compares it with the start key
		String urltimestamp;
		// Second tab-separated field of the row, parsed as a long and summed into totalAdjustment
		long count;
	}
	
	// Rows of the last-blocks file, in file order; empty until loadLastBlockSizes succeeds
	protected BlockSize[] lastBlockSizes = new BlockSize[0];
	// Name of the last-blocks file, derived from locFile in init()
	protected String blockSizesFile;
	
	// locRoot is the published value; newLocRoot is the staging value written by loadPartLocations
	protected String locRoot = null, newLocRoot = null;
	
	// Sum of the counts of all rows in lastBlockSizes
	protected long totalAdjustment = 0;
	
	// Staging values written by loadPartLocations and copied to the live fields after a load
	protected Date newStartDate, newEndDate;
	protected boolean newIsDisabled = false;
	// When true, dateRangeCheck rejects every key
	protected boolean disabled = false;
	
	//final static int DEFAULT_LOC_CACHE_EXPIRE_MILLIS = 120000;
	
	// Part id -> last URL that served that part; created in init() only when cacheRemoteLoc is set
	protected ConcurrentHashMap<String, LocCacheEntry> locCacheMap;
	
	// Enables the cached load path for http locations
	protected boolean cacheRemoteLoc = false;

	// Lifetime of a cache entry, in milliseconds
	protected int locCacheExpireMillis = 120000;
	
	// Loads slower than this many milliseconds are not cached, and evict an existing cache entry
	protected int locCacheMaxDuration = 1000;
	
	/**
	 * Value stored in the location cache: a URL plus the wall-clock time
	 * (milliseconds) after which it is considered stale.
	 * Equality and hash code depend on the URL only; the expiry is ignored.
	 */
	class LocCacheEntry
	{
		String loc;
		long expire;
		
		LocCacheEntry(String loc, long expire) 
		{ 
			this.loc = loc;
			this.expire = expire; 
		}
		
		/**
		 * Two entries are equal when their URLs are equal.
		 * An entry also reports itself equal to a String holding its URL. That
		 * comparison is one-directional, because String.equals(entry) is always
		 * false, so code that needs a match should compare entry to entry.
		 */
		@Override
		public boolean equals(Object obj)
		{
			if (obj == null) {
				return false;
			}
			
			if (obj instanceof String) {
				return loc.equals(obj);
			}
			
			if (obj instanceof LocCacheEntry) {
				return loc.equals(((LocCacheEntry)obj).loc);
			}
			
			return false;
		}
		
		/** Hash of the URL only, so entries that are equal always hash alike. */
		@Override
		public int hashCode()
		{
			return loc.hashCode();
		}
	}	
	
	/**
	 * Initializes the parent index, loads the part locations from locFile and,
	 * when checkInterval is positive, starts the background updater thread.
	 * If the location file cannot be read the cluster is marked disabled and
	 * no updater thread is started.
	 *
	 * @throws IOException if the parent initialization fails
	 */
	@Override
	public void init() throws IOException
	{
		super.init();
		
		this.blockSizesFile = toBlockSizesFile(locFile);
		
		locMap = new HashMap<String, String[]>();
		
		if (cacheRemoteLoc) {
			locCacheMap = new ConcurrentHashMap<String, LocCacheEntry>();
		}
		
		try {
			locReaderFactory = GeneralURIStreamFactory.createSeekableStreamFactory(locFile, false);
			lastModTime = locReaderFactory.getModTime();
		
			loadPartLocations(locMap);

		} catch (IOException io) {
			LOGGER.warning("Exception on Load -- Disabling Cluster! " + io.toString());
			disabled = true;
			return;
		}
		
		// Publish the values staged by loadPartLocations
		disabled = newIsDisabled;
		startDate = newStartDate;
		endDate = newEndDate;
		locRoot = newLocRoot;
		// NOTE(review): the total is computed before the last-block sizes are loaded,
		// whereas syncLoad loads them first -- confirm which order is intended.
		this.cdxLinesTotalCount = computeTotalLines();
		
		if (!disabled) {
			this.loadLastBlockSizes(blockSizesFile);
		}
		
		if (checkInterval > 0) {
			updaterThread = new Thread(new LocationUpdater(), "LocationUpdaterThread");
			updaterThread.start();
		}
	}
	
	/**
	 * Derives the last-blocks file name by swapping the last literal ".loc"
	 * in the location file name for ".lastblocks".
	 * The previous String.replaceAll(".loc", ...) call treated ".loc" as a
	 * regular expression (any character followed by "loc") and rewrote every
	 * match, so a directory such as "blocks" in the path was corrupted too.
	 * A name without ".loc" is returned unchanged, as before.
	 */
	private static String toBlockSizesFile(String locFile)
	{
		final String locSuffix = ".loc";
		int suffixStart = locFile.lastIndexOf(locSuffix);
		
		if (suffixStart < 0) {
			return locFile;
		}
		
		return locFile.substring(0, suffixStart) + ".lastblocks" + locFile.substring(suffixStart + locSuffix.length());
	}
	
	/**
	 * Reloads the location file and merges the result into the live state.
	 * Entries from the new file overwrite existing ones; keys missing from
	 * the new file are left in place. Location sets that were replaced by a
	 * different set are closed through the block loader after the merge.
	 * If the file cannot be read, nothing changes and lastModTime is left
	 * alone, so the next poll tries again.
	 *
	 * @param newModTime modification time to record once the reload succeeds
	 */
	protected void syncLoad(long newModTime)
	{
		HashMap<String, String[]> destMap = new HashMap<String, String[]>();
		
		try {
			loadPartLocations(destMap);
		} catch (IOException e) {
			LOGGER.warning(e.toString());
			return;
		}
		
		// Uses the disabled flag from before this reload; it is only updated further down
		if (!disabled) {
			this.loadLastBlockSizes(blockSizesFile);
		}
		
		if (LOGGER.isLoggable(Level.INFO)) {
			LOGGER.info("*** Location Update: " + locFile);
		}
		
		ArrayList<String[]> filesToClose = new ArrayList<String[]>();
		
		synchronized (this) {
			for (Entry<String, String[]> files : destMap.entrySet()) {
				String[] existingFiles = locMap.get(files.getKey());
				
				String[] newFiles = files.getValue();
				
				// Only location sets that actually changed need their old files closed
				if ((existingFiles != null) && !Arrays.equals(existingFiles, newFiles)) {					
					filesToClose.add(existingFiles);
				}
				
				locMap.put(files.getKey(), newFiles);
			}
			
			//locMap.putAll(destMap);
			
			// Publish the values staged by loadPartLocations
			startDate = newStartDate;
			endDate = newEndDate;
			disabled = newIsDisabled;
			locRoot = newLocRoot;
			
			this.cdxLinesTotalCount = computeTotalLines();
		}
		
		// Cached URLs may point at locations that no longer exist
		if (this.locCacheMap != null) {
			locCacheMap.clear();
		}
		
		closeExistingFiles(filesToClose);
		
		lastModTime = newModTime;
	}
	
	/**
	 * Asks the block loader to close the file factory of every location in
	 * the given sets. A failure on one location is logged and the remaining
	 * locations are still processed.
	 *
	 * @param filesToClose location sets that are no longer referenced
	 */
	private void closeExistingFiles(List<String[]> filesToClose) {
		for (String[] files : filesToClose) {
			for (String file : files) {
				try {
					blockLoader.closeFileFactory(file);
				} catch (IOException e) {
					LOGGER.warning(e.toString());
				}
			}
		}
	}

	/**
	 * Looks up the locations registered for a part. Synchronized on this
	 * instance, the same monitor syncLoad holds while it merges a reload.
	 *
	 * @param key part id
	 * @return the locations for that part, or null when the part is unknown
	 */
	public synchronized String[] getLocations(String key)
	{
		final String[] partLocations = this.locMap.get(key);
		return partLocations;
	}
	
	/** @return the published location root (see loadPartLocations for how it is derived) */
	public String getLocRoot()
	{
		return this.locRoot;
	}
	
	/** @return the path or URI of the location file */
	public String getLocFile()
	{
		return this.locFile;
	}
	
	/** @param locFile path or URI of the location file; read by init() */
	public void setLocFile(String locFile)
	{
		this.locFile = locFile;
	}
	
	/** @return lifetime of a location-cache entry, in milliseconds */
	public int getLocCacheExpireMillis() {
		return this.locCacheExpireMillis;
	}

	/** @param locCacheExpireMillis lifetime of a location-cache entry, in milliseconds */
	public void setLocCacheExpireMillis(int locCacheExpireMillis) {
		this.locCacheExpireMillis = locCacheExpireMillis;
	}

	/** @return load duration, in milliseconds, that loadCachedBalancedReader compares against when caching or evicting a URL */
	public int getLocCacheMaxDuration() {
		return this.locCacheMaxDuration;
	}

	/** @param locCacheMaxDuration load duration, in milliseconds, used as the caching / eviction threshold */
	public void setLocCacheMaxDuration(int locCacheMaxDuration) {
		this.locCacheMaxDuration = locCacheMaxDuration;
	}

	/** @return whether the cached load path for http locations is enabled */
	public boolean isCacheRemoteLoc() {
		return this.cacheRemoteLoc;
	}

	/** @param cacheRemoteLoc true to enable the cached load path; init() creates the cache only when this is set */
	public void setCacheRemoteLoc(boolean cacheRemoteLoc) {
		this.cacheRemoteLoc = cacheRemoteLoc;
	}

	/**
	 * Parses a date written in the dateFormat pattern.
	 * A value that does not parse used to be dropped silently, which left the
	 * corresponding date bound unset with no trace; it is now logged.
	 *
	 * @param date text to parse
	 * @return the parsed date, or null when the text does not match the pattern
	 */
	protected Date parseDate(String date)
	{
		try {
			return dateFormat.parse(date);
		} catch (ParseException e) {
			LOGGER.warning("Unparseable date (" + date + ") in (" + locFile + "): " + e.toString());
			return null;
		}
	}
	
	/**
	 * Decides whether this cluster should be consulted for a key.
	 * The text after the first space of the key is read as a timestamp and
	 * compared with the configured start and end dates. Keys without a space,
	 * or whose timestamp cannot be parsed, are always accepted.
	 *
	 * @param key lookup key
	 * @return false when the cluster is disabled or the timestamp falls
	 *         outside the configured range, true otherwise
	 */
	public boolean dateRangeCheck(String key)
	{
		// Allow a cluster to be "disabled" by specifying an empty ALL.loc
		if (disabled) {
			return false;
		}
		
		// No bounds configured: everything is in range
		if ((startDate == null) && (endDate == null)) {
			return true;
		}
		
		final int separator = key.indexOf(' ');
		if (separator < 0) {
			return true;
		}
		
		final Date requested;
		
		try {
			requested = ArchiveUtils.getDate(key.substring(separator + 1));
		} catch (ParseException e) {
			return true;
		}
		
		if ((startDate != null) && requested.before(startDate)) {
			return false;
		}
		
		return (endDate == null) || !requested.after(endDate);
	}
	
	/**
	 * Reads the last-blocks file and replaces lastBlockSizes and
	 * totalAdjustment with its contents. Each line is split on tabs: the
	 * second field is the count and the third the url/timestamp key.
	 * Loading is best-effort: any failure (missing file, short or malformed
	 * line) is logged, reading stops, and the rows read so far are kept.
	 * The total is summed in a local and assigned once at the end, so readers
	 * never see the field zeroed or half-summed while a reload is running.
	 *
	 * @param filename path of the last-blocks file
	 */
	protected void loadLastBlockSizes(String filename)
	{
		List<BlockSize> blocks = new ArrayList<BlockSize>();
		long adjustment = 0;
		
		BufferedReader reader = null;
		
		try {
			reader = new BufferedReader(new FileReader(filename));
			
			String line;
			
			while ((line = reader.readLine()) != null) {
				String[] splits = line.split("\t");
				
				BlockSize block = new BlockSize();
				block.count = Long.parseLong(splits[1]);
				block.urltimestamp = splits[2];
				blocks.add(block);
				adjustment += block.count;
			}
		} catch (Exception e) {
			LOGGER.warning(e.toString());

		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException e) {
					LOGGER.warning(e.toString());
				}
			}
		}
		
		totalAdjustment = adjustment;
		lastBlockSizes = blocks.toArray(new BlockSize[blocks.size()]);
	}
	
	/**
	 * Reads the location file into destMap and stages the date bounds,
	 * disabled flag and location root in the new* fields.
	 * Lines are tab-separated. The first field selects the meaning:
	 * OFF marks the cluster disabled and stops reading, _EARLIEST / _LATEST
	 * carry a date in their second field, and any other value is a part id
	 * followed by one or more locations. Empty lines are skipped and lines
	 * with fewer than two fields are logged and skipped.
	 * The location root is the first location of the first part line, up to
	 * and including its last '/'. It is now taken from each load; it used to
	 * be set only while newLocRoot was null, so a reload never refreshed it.
	 * A load that finds no part line keeps the previous root.
	 *
	 * @param destMap map that receives part id -> locations
	 * @throws IOException if the location file cannot be read
	 */
	protected void loadPartLocations(HashMap<String, String[]> destMap) throws IOException
	{
		SeekableLineReaderIterator lines = null;
		
		newStartDate = newEndDate = null;
		newIsDisabled = false;
		
		// Root derived from the first part line seen during this load
		String loadedLocRoot = null;
		
		try {
			
			lines = new SeekableLineReaderIterator(locReaderFactory.get());
			
			while (lines.hasNext()) {
				String line = lines.next();
				
				if (line.isEmpty()) {
					continue;
				}
				
				String[] parts = line.split("\\t");
				
				if (parts[0].equals(OFF)) {
					newIsDisabled = true;
					break;
				}
				
				if (parts.length < 2) {
					String msg = "Bad line(" + line + ") in (" + locFile + ")";
					LOGGER.warning(msg);
					continue;
				}
				
				if (parts[0].equals(EARLIEST_TIMESTAMP)) {
					newStartDate = parseDate(parts[1]);
					continue;
				} else if (parts[0].equals(LATEST_TIMESTAMP)) {
					newEndDate = parseDate(parts[1]);
					continue;
				}
				
				if (loadedLocRoot == null) {
					int lastSlash = parts[1].lastIndexOf('/');
					loadedLocRoot = parts[1].substring(0, lastSlash + 1);
				}
				
				// Everything after the part id is a location
				String[] locations = Arrays.copyOfRange(parts, 1, parts.length);
				
				destMap.put(parts[0], locations);
			}
		} finally {
			if (lines != null) {
				lines.close();
			}
		}
		
		if (loadedLocRoot != null) {
			newLocRoot = loadedLocRoot;
		}
	}

	/** @return milliseconds the updater thread sleeps between polls */
	public int getCheckInterval() {
		return this.checkInterval;
	}

	/** @param checkInterval milliseconds between polls; init() starts no updater thread when this is not positive */
	public void setCheckInterval(int checkInterval) {
		this.checkInterval = checkInterval;
	}

	/** @return sum of the counts of all loaded last-block rows */
	public long getTotalAdjustment() {
		return this.totalAdjustment;
	}

	/** @return number of loaded last-block rows */
	public int getNumBlocks() {
		final BlockSize[] loaded = this.lastBlockSizes;
		return loaded.length;
	}
	
	/**
	 * Sums, over the last-block rows from startPart (inclusive) to endPart
	 * (exclusive), the difference between each row's count and the regular
	 * number of lines per block. When startKey equals the key of the row at
	 * startPart, that row is skipped.
	 * The rows are read from one snapshot of lastBlockSizes, because the
	 * updater thread may replace the array between the bounds check and the
	 * indexing.
	 *
	 * @param startKey  key compared with the row at startPart
	 * @param startPart index of the first row
	 * @param endPart   index one past the last row
	 * @return the summed difference, or 0 when either index is at or beyond
	 *         the number of loaded rows
	 */
	public long getLastBlockDiff(String startKey, int startPart, int endPart) {
		final BlockSize[] sizes = lastBlockSizes;
		
		if (startPart >= sizes.length || endPart >= sizes.length) {
			return 0;
		}
		
		if (startKey.equals(sizes[startPart].urltimestamp)) {
			startPart++;
		}
		
		long diff = 0;
		
		for (int i = startPart; i < endPart; i++) {
			diff += sizes[i].count;
			diff -= this.getCdxLinesPerBlock();
		}
		
		return diff;
	}
	

	/**
	 * Computes the total line count for the cluster, adjusted for the
	 * shorter last blocks when their sizes are loaded.
	 * Takes the line count of the full summary range, subtracts the number of
	 * last-block rows minus one, multiplies by the lines per block and adds
	 * the total adjustment.
	 *
	 * @return the computed total, or 0 when the summary cannot be read
	 */
	public long computeTotalLines()
	{		
		long total;
		
		try {
			total = this.getNumLines(summary.getRange("", ""));
		} catch (IOException e) {
			LOGGER.warning(e.toString());
			return 0;
		}
		
		final long lastBlockLines = getTotalAdjustment();
		
		total -= (getNumBlocks() - 1);
		total *= this.getCdxLinesPerBlock();
		
		return total + lastBlockLines;
	}
	
	/**
	 * Range lookup guarded by the date-range check: keys rejected by
	 * dateRangeCheck get the shared empty iterator, all others are passed
	 * to the parent implementation.
	 *
	 * @throws IOException if the parent lookup fails
	 */
	public CloseableIterator getCDXIterator(String key, String start, String end, ZipNumParams params) throws IOException {
		
		if (dateRangeCheck(key)) {
			return super.getCDXIterator(key, start, end, params);
		}
		
		return EMPTY_ITERATOR;
	}
	
	/**
	 * Prefix / exact lookup guarded by the date-range check: keys rejected by
	 * dateRangeCheck get the shared empty iterator, all others are passed
	 * to the parent implementation.
	 *
	 * @throws IOException if the parent lookup fails
	 */
	public CloseableIterator getCDXIterator(String key, String prefix, boolean exact, ZipNumParams params) throws IOException {
		
		if (dateRangeCheck(key)) {
			return super.getCDXIterator(key, prefix, exact, params);
		}
		
		return EMPTY_ITERATOR;
	}
	
	/** @return true when the cluster is switched off, either by an OFF line in the location file or by a failed initial load */
	public boolean isDisabled() {
		return disabled;
	}
		
	/**
	 * Loads one block of a part from the first location that answers.
	 * When remote caching is on and the first location is http, the load is
	 * delegated to loadCachedBalancedReader; otherwise the locations are
	 * tried in their listed order.
	 *
	 * @param partId      part whose block is wanted
	 * @param startOffset offset of the block
	 * @param totalLength length of the block
	 * @return a reader over the block, or null when the part is unknown or
	 *         no location could be loaded
	 */
	@Override
	SeekableLineReader doBlockLoad(String partId, long startOffset, int totalLength) {
		
		final String[] candidates = getLocations(partId);
		
		if (candidates == null) {
			LOGGER.severe("No locations for block(" + partId +")");
			return null;
		}
		
		// Attempt cached load for http
		final boolean useCachedPath = cacheRemoteLoc && (locCacheMap != null) && (candidates.length > 0)
				&& GeneralURIStreamFactory.isHttp(candidates[0]);
		
		if (useCachedPath) {
			return loadCachedBalancedReader(partId, candidates, startOffset, totalLength);
		}
		
		// Standard block load path
		for (int i = 0; i < candidates.length; i++) {
			SeekableLineReader loaded = blockLoader.attemptLoadBlock(candidates[i], startOffset, totalLength, true, isRequired());
			
			if (loaded != null) {
				return loaded;
			}
		}
		
		return null;
	}
		
	/**
	 * Returns the cached URL for a part if one exists and has not expired.
	 * An expired entry is removed from the cache as a side effect.
	 *
	 * @param key part id
	 * @return the cached URL, or null when there is none or it has expired
	 */
	protected String locCacheGet(String key)
	{
		final LocCacheEntry cached = locCacheMap.get(key);
		
		if (cached != null) {
			if (System.currentTimeMillis() <= cached.expire) {
				return cached.loc;
			}
			
			locCacheMap.remove(key);
		}
		
		return null;
	}
	
	/**
	 * Caches a URL for a part unless the part already has an entry.
	 * The entry expires locCacheExpireMillis milliseconds from now.
	 *
	 * @param key part id
	 * @param loc URL to remember
	 */
	protected void locCachePut(String key, String loc)
	{
		final long expiresAt = System.currentTimeMillis() + locCacheExpireMillis;
		final LocCacheEntry fresh = new LocCacheEntry(loc, expiresAt);
		
		locCacheMap.putIfAbsent(key, fresh);
	}
	
	/**
	 * Loads a block, preferring the URL that last served this part.
	 * The cached URL, if any, is tried first. If it fails or takes longer
	 * than locCacheMaxDuration, its cache entry is evicted. When the cached
	 * URL yields no reader, the remaining locations are tried in random
	 * order, and the first one that answers quickly enough is cached.
	 *
	 * Fixes relative to the previous version:
	 * - Eviction passed the URL String to remove(key, value). The map
	 *   evaluates argument.equals(stored), and String.equals(LocCacheEntry)
	 *   is always false, so nothing was ever evicted. A LocCacheEntry is
	 *   passed now.
	 * - The "required" flag was tied to the last shuffled index, so it was
	 *   never set when the skipped cached URL happened to be last. It is now
	 *   set on the last location actually attempted.
	 * - The cast to HTTPSeekableLineReader is guarded, so a reader of another
	 *   type is returned uncached instead of failing with a ClassCastException.
	 *
	 * @param partId    part whose block is wanted
	 * @param locations candidate locations for the part
	 * @param offset    offset of the block
	 * @param length    length of the block
	 * @return a reader over the block, or null when no location could be loaded
	 */
	SeekableLineReader loadCachedBalancedReader(String partId, String[] locations, long offset, int length)
	{
		String cachedUrl = locCacheGet(partId);
		
		if (cachedUrl != null) {
			long start = System.currentTimeMillis();
			
			SeekableLineReader cachedReader = blockLoader.attemptLoadBlock(cachedUrl, offset, length, true, false);
		
			long duration = System.currentTimeMillis() - start;
			
			if ((cachedReader == null) || (duration > locCacheMaxDuration)) {
				// Entries compare by URL only, so the expiry passed here is irrelevant
				locCacheMap.remove(partId, new LocCacheEntry(cachedUrl, 0L));
			}
			
			if (cachedReader != null) {
				return cachedReader;
			}
		}
		
		// Every location except the cached url that just failed
		List<String> candidates = new ArrayList<String>(locations.length);
		
		for (String location : locations) {
			if ((cachedUrl == null) || !location.equals(cachedUrl)) {
				candidates.add(location);
			}
		}
		
		if (candidates.size() > 1) {
			Collections.shuffle(candidates);
		}
		
		final int lastAttempt = candidates.size() - 1;
		
		for (int i = 0; i <= lastAttempt; i++) {
			
			long start = System.currentTimeMillis();
			
			// Only the final attempt is flagged as required
			boolean required = (isRequired() && (i == lastAttempt));
			
			SeekableLineReader reader = blockLoader.attemptLoadBlock(candidates.get(i), offset, length, true, required);
			
			long duration = System.currentTimeMillis() - start;
			
			if (reader != null) {
				if (reader instanceof HTTPSeekableLineReader) {
					String connectedUrl = ((HTTPSeekableLineReader)reader).getConnectedUrl();
					
					if ((duration < locCacheMaxDuration) && (connectedUrl != null)) {
						locCachePut(partId, connectedUrl);
					}
				}
				
				return reader;
			}
		}

		return null;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy