All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.jaeksoft.searchlib.crawler.cache.HadoopCrawlCache Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class search engine. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you can quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

The newest version!
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2012-2014 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.crawler.cache;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.poi.util.IOUtils;
import org.json.JSONException;
import org.json.JSONObject;

import com.jaeksoft.searchlib.crawler.web.spider.DownloadItem;
import com.jaeksoft.searchlib.util.ReadWriteLock;

/**
 * Crawl cache provider backed by a Hadoop {@link FileSystem}.
 * <p>
 * For each cached URI two files are stored below
 * {@link #PATH_HTTP_DOWNLOAD_CACHE}: one with the {@code meta} extension
 * holding the item's metadata as a JSON string, and one with the
 * {@code content} extension holding the raw downloaded bytes.
 * <p>
 * Operations that use the file system take the read lock; {@link #init} and
 * {@link #close} take the write lock because they replace or release the
 * file system instance.
 */
public class HadoopCrawlCache extends CrawlCacheProvider {

	/** Root directory of the cache on the Hadoop file system. */
	private final static String PATH_HTTP_DOWNLOAD_CACHE = Path.SEPARATOR
			+ "opensearchserver" + Path.SEPARATOR + "http-download-cache";

	/** Extension of the file holding the JSON metadata of a cached item. */
	private final static String META_EXTENSION = "meta";

	/** Extension of the file holding the content of a cached item. */
	private final static String CONTENT_EXTENSION = "content";

	/**
	 * Names of the Hadoop configuration files that are looked up in the
	 * folder passed to {@link #init(String)}.
	 */
	private final static String[] CONFIG_FILES = { "core-default.xml",
			"core-site.xml" };

	private final ReadWriteLock rwl = new ReadWriteLock();

	/** Current file system, or {@code null} when not initialized / closed. */
	private FileSystem fileSystem;

	/** Configuration built by the last call to {@link #init(String)}. */
	private Configuration configuration;

	public HadoopCrawlCache() {
		super(CrawlCacheProviderEnum.HADOOP);
		configuration = null;
		fileSystem = null;
	}

	/**
	 * (Re)initializes the cache: closes any previously opened file system,
	 * builds a new configuration from the files listed in
	 * {@link #CONFIG_FILES} and obtains the file system from it.
	 * 
	 * @param configString
	 *            path of the folder containing the Hadoop configuration
	 *            files
	 * @throws IOException
	 *             if the file system cannot be obtained
	 */
	@Override
	public void init(String configString) throws IOException {
		rwl.w.lock();
		try {
			closeNoLock();
			configuration = new Configuration();
			for (String configFile : CONFIG_FILES)
				configuration.addResource(new Path(configString, configFile));
			fileSystem = FileSystem.get(configuration);
		} finally {
			rwl.w.unlock();
		}
	}

	/**
	 * Closes the file system, if any, without taking the lock. The caller
	 * must hold the write lock.
	 */
	final private void closeNoLock() {
		if (fileSystem != null) {
			IOUtils.closeQuietly(fileSystem);
			fileSystem = null;
		}
	}

	/**
	 * Releases the file system. Subsequent cache operations fail with an
	 * {@link IOException} until {@link #init(String)} is called again.
	 */
	@Override
	public void close() {
		rwl.w.lock();
		try {
			closeNoLock();
		} finally {
			rwl.w.unlock();
		}
	}

	/**
	 * @return the string representation of the current configuration, or
	 *         {@code null} if the cache has never been initialized
	 */
	@Override
	public String getInfos() throws IOException {
		rwl.r.lock();
		try {
			if (configuration == null)
				return null;
			return configuration.toString();
		} finally {
			rwl.r.unlock();
		}
	}

	/**
	 * Maps a URI to the path of its cache file with the given extension,
	 * below {@link #PATH_HTTP_DOWNLOAD_CACHE}. The actual layout is computed
	 * by the parent class.
	 * 
	 * @param uri
	 *            the URI of the cached item
	 * @param extension
	 *            {@link #META_EXTENSION} or {@link #CONTENT_EXTENSION}
	 * @return the path of the cache file
	 */
	private Path uriToPath(URI uri, String extension)
			throws UnsupportedEncodingException {
		// NOTE(review): 10 and 32 are passed through to the parent's
		// uriToPath; their exact meaning is defined there.
		String path = super.uriToPath(uri, PATH_HTTP_DOWNLOAD_CACHE, 10,
				Path.SEPARATOR, extension, 32);
		return new Path(path);
	}

	/**
	 * Stores the metadata and the content of a downloaded item in the cache.
	 * The item's content stream is consumed and closed, whether or not the
	 * write succeeds.
	 * 
	 * @param downloadItem
	 *            the item to store
	 * @return a new stream opened on the cached content
	 * @throws IOException
	 *             if the file system is not configured or an I/O error
	 *             occurs
	 */
	@Override
	public InputStream store(DownloadItem downloadItem) throws IOException,
			JSONException {
		rwl.r.lock();
		try {
			// Fail with a clear message rather than a NullPointerException
			// when the cache has not been initialized or has been closed.
			checkFileSystemAvailable();
			URI uri = downloadItem.getUri();

			Path metaPath = checkPath(uriToPath(uri, META_EXTENSION));
			write(metaPath, downloadItem.getMetaAsJson());

			Path contentPath = checkPath(uriToPath(uri, CONTENT_EXTENSION));
			InputStream is = downloadItem.getContentInputStream();
			try {
				write(contentPath, is);
			} finally {
				// Always release the source stream, even if the copy failed.
				IOUtils.closeQuietly(is);
			}
			return fileSystem.open(contentPath);
		} finally {
			rwl.r.unlock();
		}
	}

	/**
	 * Loads a cached item.
	 * 
	 * @param uri
	 *            the URI of the item
	 * @param expirationTime
	 *            if not 0, entries whose metadata file was modified before
	 *            this time are ignored
	 * @return the cached item with its metadata loaded and its content
	 *         stream opened on the cached content, or {@code null} if there
	 *         is no metadata file or the entry is expired
	 * @throws IOException
	 *             if the file system is not configured or an I/O error
	 *             occurs
	 */
	@Override
	public DownloadItem load(URI uri, long expirationTime) throws IOException,
			JSONException, URISyntaxException {
		rwl.r.lock();
		try {
			checkFileSystemAvailable();
			Path metaPath = uriToPath(uri, META_EXTENSION);
			if (!fileSystem.exists(metaPath))
				return null;
			if (expirationTime != 0
					&& fileSystem.getFileStatus(metaPath)
							.getModificationTime() < expirationTime)
				return null;
			JSONObject json = new JSONObject(read(metaPath));
			DownloadItem downloadItem = new DownloadItem(uri);
			downloadItem.loadMetaFromJson(json);
			Path contentPath = uriToPath(uri, CONTENT_EXTENSION);
			downloadItem.setContentInputStream(fileSystem.open(contentPath));
			return downloadItem;
		} finally {
			rwl.r.unlock();
		}
	}

	/**
	 * Removes the metadata and content files of one URI from the cache.
	 * 
	 * @param uri
	 *            the URI of the item to remove
	 * @return {@code true} if at least one file has been deleted
	 * @throws IOException
	 *             if the file system is not configured or an I/O error
	 *             occurs
	 */
	@Override
	public boolean flush(URI uri) throws IOException {
		rwl.r.lock();
		try {
			checkFileSystemAvailable();
			boolean deleted = false;
			Path path = uriToPath(uri, META_EXTENSION);
			if (fileSystem.exists(path) && fileSystem.delete(path, false))
				deleted = true;
			path = uriToPath(uri, CONTENT_EXTENSION);
			if (fileSystem.exists(path) && fileSystem.delete(path, false))
				deleted = true;
			return deleted;
		} finally {
			rwl.r.unlock();
		}
	}

	/**
	 * Recursively deletes the files modified before the expiration time,
	 * then the directories that are empty after the purge of their content.
	 * 
	 * @param files
	 *            the entries to examine
	 * @param expiration
	 *            files with a modification time lower than this value are
	 *            deleted
	 * @return the number of deleted files and directories
	 */
	private final long purge(FileStatus[] files, long expiration)
			throws IOException {
		long count = 0;
		for (FileStatus file : files) {
			if (file.isDirectory()) {
				Path p = file.getPath();
				count += purge(fileSystem.listStatus(p), expiration);
				// List again: the directory is only removed if the purge
				// above left it empty.
				FileStatus[] fs = fileSystem.listStatus(p);
				if (fs.length == 0)
					if (fileSystem.delete(p, false))
						count++;
			} else {
				if (file.getModificationTime() < expiration)
					if (fileSystem.delete(file.getPath(), false))
						count++;
			}
		}
		return count;
	}

	/**
	 * @throws IOException
	 *             if no file system is currently available
	 */
	private void checkFileSystemAvailable() throws IOException {
		if (fileSystem == null)
			throw new IOException("File system not configured");
	}

	/**
	 * Removes every cache entry modified before the given time.
	 * 
	 * @param expiration
	 *            files with a modification time lower than this value are
	 *            deleted
	 * @return the number of deleted files and directories; 0 if the cache
	 *         root directory does not exist yet
	 * @throws IOException
	 *             if the file system is not configured or an I/O error
	 *             occurs
	 */
	@Override
	public long flush(long expiration) throws IOException {
		rwl.r.lock();
		try {
			checkFileSystemAvailable();
			Path path = new Path(PATH_HTTP_DOWNLOAD_CACHE);
			// Nothing has been stored yet: listing a missing directory
			// would fail, while there is simply nothing to purge.
			if (!fileSystem.exists(path))
				return 0;
			return purge(fileSystem.listStatus(path), expiration);
		} finally {
			rwl.r.unlock();
		}
	}

	/**
	 * Reads a string previously written by {@link #write(Path, String)}.
	 */
	private String read(Path path) throws IOException {
		FSDataInputStream in = fileSystem.open(path);
		try {
			return in.readUTF();
		} finally {
			IOUtils.closeQuietly(in);
		}
	}

	/**
	 * Makes sure the parent directory of the given file exists, creating it
	 * if needed.
	 * 
	 * @return the given path, for call chaining
	 */
	private Path checkPath(Path path) throws IOException {
		if (!fileSystem.exists(path)) {
			Path parent = path.getParent();
			if (!fileSystem.exists(parent))
				fileSystem.mkdirs(parent);
		}
		return path;
	}

	/**
	 * Writes a string to the given file, overwriting any existing file.
	 * <p>
	 * NOTE(review): the string is written with {@code writeUTF}, which
	 * rejects strings whose encoded form exceeds 65535 bytes. The format is
	 * kept as is so that existing cache entries remain readable.
	 */
	private void write(Path path, String content) throws IOException {
		FSDataOutputStream out = fileSystem.create(path, true);
		try {
			out.writeUTF(content);
		} finally {
			IOUtils.closeQuietly(out);
		}
	}

	/**
	 * Copies a stream to the given file, overwriting any existing file. The
	 * input stream is not closed by this method.
	 */
	private void write(Path path, InputStream in) throws IOException {
		FSDataOutputStream out = fileSystem.create(path, true);
		try {
			IOUtils.copy(in, out);
		} finally {
			IOUtils.closeQuietly(out);
		}
	}

	@Override
	public String getConfigurationInformation() {
		return "Please provide the path to the Hadoop configuration (etc) folder";
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy