All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.jaeksoft.searchlib.crawler.cache.LocalFileCrawlCache Maven / Gradle / Ivy

/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2012-2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see http://www.gnu.org/licenses/.
 **/

package com.jaeksoft.searchlib.crawler.cache;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.commons.io.FileUtils;
import org.apache.poi.util.IOUtils;
import org.json.JSONException;
import org.json.JSONObject;

import com.jaeksoft.searchlib.ClientFactory;
import com.jaeksoft.searchlib.crawler.web.spider.DownloadItem;
import com.jaeksoft.searchlib.util.ReadWriteLock;

/**
 * Crawl cache provider that keeps downloaded items on the local file system.
 * <p>
 * Every cached URI is stored as two files located below
 * {@code <rootPath>/http-download-cache}: one file with the {@code meta}
 * extension holding the JSON returned by {@link DownloadItem#getMetaAsJson()},
 * and one file with the {@code content} extension holding the downloaded
 * bytes. The exact file location is computed by the inherited
 * {@code uriToPath} method.
 * <p>
 * Locking: the write lock is only taken by {@link #init(String)} and
 * {@link #close()}, which change {@code rootPath}. All other operations take
 * the read lock, so a store, a load and a flush may run at the same time.
 * <p>
 * NOTE(review): the meta file is written and read with the platform default
 * charset (commons-io overloads without an explicit encoding). Confirm whether
 * an explicit charset is wanted before changing it, existing cache entries
 * would have to stay readable.
 */
public class LocalFileCrawlCache extends CrawlCacheProvider {

	private final static String PATH_HTTP_DOWNLOAD_CACHE = File.separator
			+ "http-download-cache";

	private final static String META_EXTENSION = "meta";

	private final static String CONTENT_EXTENSION = "content";

	private final ReadWriteLock rwl = new ReadWriteLock();

	/** Absolute path of the cache directory, null until init() succeeded. */
	private String rootPath = null;

	public LocalFileCrawlCache() {
		super(CrawlCacheProviderEnum.LOCAL_FILE);
	}

	/**
	 * Forgets the cache directory. The files already written are not touched.
	 */
	@Override
	public void close() {
		rwl.w.lock();
		try {
			rootPath = null;
		} finally {
			rwl.w.unlock();
		}
	}

	/**
	 * @return the absolute path of the cache directory, or null if the cache
	 *         is not initialized
	 */
	@Override
	public String getInfos() throws IOException {
		rwl.r.lock();
		try {
			return rootPath;
		} finally {
			rwl.r.unlock();
		}
	}

	/**
	 * Sets the cache directory, creating it when it does not exist yet.
	 * 
	 * @param configString
	 *            the path of the cache directory
	 * @throws IOException
	 *             if the path is missing, cannot be created, or is not a
	 *             directory
	 */
	@Override
	public void init(String configString) throws IOException {
		// new File(null) would fail with a bare NullPointerException
		if (configString == null)
			throw new IOException("The path of the cache directory is missing");
		rwl.w.lock();
		try {
			File f = new File(configString);
			if (!f.exists()) {
				ClientFactory.INSTANCE.properties.checkChroot(f);
				// The result is verified by the exists() test just below
				f.mkdirs();
			}
			if (!f.exists())
				throw new IOException("The folder " + f.getAbsolutePath()
						+ " does not exists");
			if (!f.isDirectory())
				throw new IOException("The path " + f.getAbsolutePath()
						+ " is not a directory");
			rootPath = f.getAbsolutePath();
		} finally {
			rwl.w.unlock();
		}
	}

	/**
	 * Fails when the cache has no root directory (init() not called yet, or
	 * close() already called). Without this check the paths would be built
	 * from the string "null".
	 */
	private void checkInitialized() throws IOException {
		if (rootPath == null)
			throw new IOException("The crawl cache is not initialized");
	}

	/**
	 * Builds the cache file associated to an URI.
	 * 
	 * @param uri
	 *            the cached URI
	 * @param extension
	 *            META_EXTENSION or CONTENT_EXTENSION
	 * @return the file, which may not exist
	 * @throws IOException
	 *             if the cache is not initialized or the path cannot be
	 *             computed
	 */
	private File uriToFile(URI uri, String extension) throws IOException {
		checkInitialized();
		String path = super.uriToPath(uri, rootPath + File.separator
				+ PATH_HTTP_DOWNLOAD_CACHE, 10, File.separator, extension, 32);
		return new File(path);
	}

	/**
	 * Makes sure the parent directory of a file about to be written exists.
	 * 
	 * @param file
	 *            the file which will be written
	 * @return the same file
	 * @throws IOException
	 *             if the parent directory cannot be created
	 */
	private File checkPath(File file) throws IOException {
		if (file.exists())
			return file;
		File parent = file.getParentFile();
		if (parent == null || parent.exists())
			return file;
		// mkdirs() also returns false when another thread created the
		// directory in the meantime, hence the second test.
		if (!parent.mkdirs() && !parent.isDirectory())
			throw new IOException("Unable to create the folder "
					+ parent.getAbsolutePath());
		return file;
	}

	/**
	 * Writes the meta data and the content of a downloaded item in the cache.
	 * The content stream of the item is consumed and closed.
	 * 
	 * @param downloadItem
	 *            the item to store
	 * @return a new stream reading the content back from the cache file
	 * @throws IOException
	 *             if the cache is not initialized or a file cannot be written.
	 *             In that case the entry is removed from the cache.
	 */
	@Override
	public InputStream store(DownloadItem downloadItem) throws IOException,
			JSONException {
		rwl.r.lock();
		try {
			URI uri = downloadItem.getUri();
			File metaFile = checkPath(uriToFile(uri, META_EXTENSION));
			File contentFile = checkPath(uriToFile(uri, CONTENT_EXTENSION));
			InputStream is = null;
			boolean stored = false;
			try {
				FileUtils.writeStringToFile(metaFile,
						downloadItem.getMetaAsJson());
				is = downloadItem.getContentInputStream();
				FileUtils.copyInputStreamToFile(is, contentFile);
				stored = true;
			} finally {
				if (is != null)
					IOUtils.closeQuietly(is);
				if (!stored) {
					// Best effort: do not leave a meta file pointing to a
					// missing or truncated content file.
					metaFile.delete();
					contentFile.delete();
				}
			}
			return new FileInputStream(contentFile);
		} finally {
			rwl.r.unlock();
		}
	}

	/**
	 * Loads an item from the cache.
	 * 
	 * @param uri
	 *            the URI of the item
	 * @param expirationTime
	 *            when not 0, an entry whose meta file was last modified before
	 *            this time is ignored
	 * @return the item with its meta data and an open stream on its content,
	 *         or null if there is no usable entry (no meta file, expired
	 *         entry, or no content file)
	 */
	@Override
	public DownloadItem load(URI uri, long expirationTime) throws IOException,
			JSONException, URISyntaxException {
		rwl.r.lock();
		try {
			File metaFile = uriToFile(uri, META_EXTENSION);
			if (!metaFile.exists())
				return null;
			if (expirationTime != 0
					&& metaFile.lastModified() < expirationTime)
				return null;
			// store() writes the meta file first and runs under the same
			// read lock: a meta file without content is a cache miss, not an
			// error.
			File contentFile = uriToFile(uri, CONTENT_EXTENSION);
			if (!contentFile.isFile())
				return null;
			String content = FileUtils.readFileToString(metaFile);
			JSONObject json = new JSONObject(content);
			DownloadItem downloadItem = new DownloadItem(uri);
			downloadItem.loadMetaFromJson(json);
			downloadItem.setContentInputStream(new FileInputStream(
					contentFile));
			return downloadItem;
		} finally {
			rwl.r.unlock();
		}
	}

	/**
	 * Removes the entry of an URI from the cache.
	 * 
	 * @param uri
	 *            the URI of the item
	 * @return true if the meta file or the content file has been deleted
	 */
	@Override
	public boolean flush(URI uri) throws IOException {
		rwl.r.lock();
		try {
			boolean deleted = false;
			File file = uriToFile(uri, META_EXTENSION);
			if (file.exists())
				deleted = file.delete() || deleted;
			file = uriToFile(uri, CONTENT_EXTENSION);
			if (file.exists())
				deleted = file.delete() || deleted;
			return deleted;
		} finally {
			rwl.r.unlock();
		}
	}

	/**
	 * Recursively deletes the files last modified before the expiration time,
	 * then the directories left empty.
	 * 
	 * @param files
	 *            the content of a directory, may be null
	 * @param expiration
	 *            files last modified before this time are deleted
	 * @return the number of deleted files and directories
	 */
	private long purge(File[] files, long expiration) throws IOException {
		if (files == null)
			return 0;
		long count = 0;
		for (File file : files) {
			if (file.isDirectory()) {
				count += purge(file.listFiles(), expiration);
				// List again: the recursive call may have emptied it
				File[] remaining = file.listFiles();
				if (remaining == null)
					continue;
				if (remaining.length == 0 && file.delete())
					count++;
			} else if (file.lastModified() < expiration) {
				if (file.delete())
					count++;
			}
		}
		return count;
	}

	/**
	 * Removes from the cache every file last modified before the expiration
	 * time.
	 * 
	 * @param expiration
	 *            files last modified before this time are deleted
	 * @return the number of deleted files and directories
	 * @throws IOException
	 *             if the cache is not initialized
	 */
	@Override
	public long flush(long expiration) throws IOException {
		rwl.r.lock();
		try {
			checkInitialized();
			File file = new File(rootPath + File.separator
					+ PATH_HTTP_DOWNLOAD_CACHE);
			return purge(file.listFiles(), expiration);
		} finally {
			rwl.r.unlock();
		}
	}

	@Override
	public String getConfigurationInformation() {
		return "Please provide the path of the cache directory (Eg.: /var/local/oss_crawl_cache)";
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy