
// com.jaeksoft.searchlib.crawler.cache.HadoopCrawlCache Maven / Gradle / Ivy
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2012-2014 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler.cache;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.poi.util.IOUtils;
import org.json.JSONException;
import org.json.JSONObject;
import com.jaeksoft.searchlib.crawler.web.spider.DownloadItem;
import com.jaeksoft.searchlib.util.ReadWriteLock;
/**
 * Crawl cache provider that keeps downloaded items on a Hadoop
 * {@link FileSystem}.
 * <p>
 * Every cached URI is represented by two files located under
 * {@link #PATH_HTTP_DOWNLOAD_CACHE}: a "meta" file containing the item
 * metadata as a JSON string, and a "content" file containing the raw
 * downloaded body.
 * <p>
 * The read/write lock guards the {@code fileSystem} and {@code configuration}
 * references: {@link #init(String)} and {@link #close()} take the write lock,
 * every other operation takes the read lock.
 */
public class HadoopCrawlCache extends CrawlCacheProvider {

    /** Root directory of the cache on the Hadoop file system. */
    private final static String PATH_HTTP_DOWNLOAD_CACHE = Path.SEPARATOR
            + "opensearchserver" + Path.SEPARATOR + "http-download-cache";

    /** Extension passed to the path builder for metadata files. */
    private final static String META_EXTENSION = "meta";

    /** Extension passed to the path builder for content files. */
    private final static String CONTENT_EXTENSION = "content";

    /** Hadoop resource files looked up in the configured folder. */
    private final static String[] CONFIG_FILES = { "core-default.xml",
            "core-site.xml" };

    private final ReadWriteLock rwl = new ReadWriteLock();

    /** Current file system, or {@code null} when not initialised/closed. */
    private FileSystem fileSystem;

    /** Configuration built by the last call to {@link #init(String)}. */
    private Configuration configuration;

    /**
     * Creates an unconfigured provider. {@link #init(String)} must be called
     * before the cache can be used.
     */
    public HadoopCrawlCache() {
        super(CrawlCacheProviderEnum.HADOOP);
        configuration = null;
        fileSystem = null;
    }

    /**
     * (Re)initialises the provider: closes any previously opened file system,
     * builds a new {@link Configuration} from the resource files found in the
     * given folder and obtains the matching {@link FileSystem}.
     *
     * @param configString
     *            path of the folder containing the Hadoop configuration files
     * @throws IOException
     *             if the file system cannot be obtained
     */
    @Override
    public void init(String configString) throws IOException {
        rwl.w.lock();
        try {
            closeNoLock();
            configuration = new Configuration();
            for (String configFile : CONFIG_FILES) {
                configuration.addResource(new Path(configString, configFile));
            }
            fileSystem = FileSystem.get(configuration);
        } finally {
            rwl.w.unlock();
        }
    }

    /**
     * Closes the file system, if any, and clears the reference. The caller
     * must hold the write lock.
     */
    private void closeNoLock() {
        if (fileSystem != null) {
            IOUtils.closeQuietly(fileSystem);
            fileSystem = null;
        }
    }

    /**
     * Closes the underlying file system. The provider can be initialised
     * again afterwards with {@link #init(String)}.
     */
    @Override
    public void close() {
        rwl.w.lock();
        try {
            closeNoLock();
        } finally {
            rwl.w.unlock();
        }
    }

    /**
     * @return the string form of the current Hadoop configuration, or
     *         {@code null} if {@link #init(String)} has never been called
     */
    @Override
    public String getInfos() throws IOException {
        rwl.r.lock();
        try {
            if (configuration == null) {
                return null;
            }
            return configuration.toString();
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Builds the cache path of a URI for the given file extension. The actual
     * layout is delegated to the superclass {@code uriToPath}; the numeric
     * arguments (10 and 32) are forwarded as-is and their meaning is defined
     * there.
     *
     * @param uri
     *            the cached URI
     * @param extension
     *            {@link #META_EXTENSION} or {@link #CONTENT_EXTENSION}
     * @return the Hadoop path of the cache file
     */
    private Path uriToPath(URI uri, String extension)
            throws UnsupportedEncodingException {
        String path = super.uriToPath(uri, PATH_HTTP_DOWNLOAD_CACHE, 10,
                Path.SEPARATOR, extension, 32);
        return new Path(path);
    }

    /**
     * Writes the metadata and the content of a download item into the cache
     * and returns a fresh stream over the stored content.
     * <p>
     * The item's original content stream is fully consumed and always closed,
     * even when the copy fails.
     *
     * @param downloadItem
     *            the item to store
     * @return a new input stream reading the cached content; the caller is
     *         responsible for closing it
     * @throws IOException
     *             if the file system is not configured or an I/O error occurs
     * @throws JSONException
     *             if the metadata cannot be serialised
     */
    @Override
    public InputStream store(DownloadItem downloadItem) throws IOException,
            JSONException {
        rwl.r.lock();
        try {
            // Fail with the same explicit error as load()/flush() instead of
            // a NullPointerException when init() has not been called.
            checkFileSystemAvailable();
            URI uri = downloadItem.getUri();
            Path metaPath = checkPath(uriToPath(uri, META_EXTENSION));
            write(metaPath, downloadItem.getMetaAsJson());
            Path contentPath = checkPath(uriToPath(uri, CONTENT_EXTENSION));
            InputStream is = downloadItem.getContentInputStream();
            try {
                write(contentPath, is);
            } finally {
                // Release the source stream whether or not the copy succeeded.
                IOUtils.closeQuietly(is);
            }
            return fileSystem.open(contentPath);
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Loads a download item from the cache.
     *
     * @param uri
     *            the URI to look up
     * @param expirationTime
     *            when non-zero, a meta file whose modification time is
     *            strictly lower than this value is treated as expired
     * @return the cached item with its content stream opened, or {@code null}
     *         if there is no meta file for the URI or if it is expired
     * @throws IOException
     *             if the file system is not configured or an I/O error occurs
     *             (including a missing content file)
     * @throws JSONException
     *             if the stored metadata cannot be parsed
     */
    @Override
    public DownloadItem load(URI uri, long expirationTime) throws IOException,
            JSONException, URISyntaxException {
        rwl.r.lock();
        try {
            checkFileSystemAvailable();
            Path metaPath = uriToPath(uri, META_EXTENSION);
            if (!fileSystem.exists(metaPath)) {
                return null;
            }
            if (expirationTime != 0
                    && fileSystem.getFileStatus(metaPath)
                            .getModificationTime() < expirationTime) {
                return null;
            }
            JSONObject json = new JSONObject(read(metaPath));
            DownloadItem downloadItem = new DownloadItem(uri);
            downloadItem.loadMetaFromJson(json);
            Path contentPath = uriToPath(uri, CONTENT_EXTENSION);
            downloadItem.setContentInputStream(fileSystem.open(contentPath));
            return downloadItem;
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Removes the cache files (meta and content) of a single URI.
     *
     * @param uri
     *            the URI to evict
     * @return {@code true} if at least one file was deleted
     * @throws IOException
     *             if the file system is not configured or an I/O error occurs
     */
    @Override
    public boolean flush(URI uri) throws IOException {
        rwl.r.lock();
        try {
            checkFileSystemAvailable();
            boolean deleted = false;
            Path metaPath = uriToPath(uri, META_EXTENSION);
            if (fileSystem.exists(metaPath)) {
                deleted = fileSystem.delete(metaPath, false);
            }
            Path contentPath = uriToPath(uri, CONTENT_EXTENSION);
            if (fileSystem.exists(contentPath)) {
                deleted = fileSystem.delete(contentPath, false) || deleted;
            }
            return deleted;
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Recursively deletes the files whose modification time is strictly lower
     * than the expiration, then removes the directories left empty.
     *
     * @param files
     *            the entries to examine
     * @param expiration
     *            modification-time threshold
     * @return the number of deleted entries, removed directories included
     */
    private long purge(FileStatus[] files, long expiration)
            throws IOException {
        long count = 0;
        for (FileStatus file : files) {
            Path p = file.getPath();
            if (file.isDirectory()) {
                count += purge(fileSystem.listStatus(p), expiration);
                // List again: the recursive purge may have emptied the
                // directory, in which case it is removed as well.
                if (fileSystem.listStatus(p).length == 0
                        && fileSystem.delete(p, false)) {
                    count++;
                }
            } else if (file.getModificationTime() < expiration
                    && fileSystem.delete(p, false)) {
                count++;
            }
        }
        return count;
    }

    /**
     * @throws IOException
     *             if no file system is currently opened
     */
    private void checkFileSystemAvailable() throws IOException {
        if (fileSystem == null) {
            throw new IOException("File system not configured");
        }
    }

    /**
     * Removes every cache entry whose modification time is strictly lower
     * than the given expiration.
     *
     * @param expiration
     *            modification-time threshold
     * @return the number of deleted entries (see
     *         {@link #purge(FileStatus[], long)}); 0 when the cache directory
     *         does not exist yet
     * @throws IOException
     *             if the file system is not configured or an I/O error occurs
     */
    @Override
    public long flush(long expiration) throws IOException {
        rwl.r.lock();
        try {
            checkFileSystemAvailable();
            Path path = new Path(PATH_HTTP_DOWNLOAD_CACHE);
            // Nothing has been cached yet: listStatus would fail on the
            // missing directory, while there is simply nothing to purge.
            if (!fileSystem.exists(path)) {
                return 0;
            }
            return purge(fileSystem.listStatus(path), expiration);
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Reads back a string previously stored by {@link #write(Path, String)}.
     */
    private String read(Path path) throws IOException {
        FSDataInputStream in = fileSystem.open(path);
        try {
            return in.readUTF();
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * Makes sure the parent directory of the given path exists.
     *
     * @return the path itself, to allow call chaining
     */
    private Path checkPath(Path path) throws IOException {
        if (!fileSystem.exists(path)) {
            Path parent = path.getParent();
            if (!fileSystem.exists(parent)) {
                fileSystem.mkdirs(parent);
            }
        }
        return path;
    }

    /**
     * Writes a string to the given path, overwriting any existing file.
     * <p>
     * NOTE: the string is stored with {@code writeUTF}, which rejects content
     * whose encoded form exceeds 65535 bytes. The format is kept so that
     * files already present in the cache remain readable.
     */
    private void write(Path path, String content) throws IOException {
        FSDataOutputStream out = fileSystem.create(path, true);
        boolean closed = false;
        try {
            out.writeUTF(content);
            // Close explicitly so that a failure while finalising the file is
            // reported to the caller instead of being silently swallowed.
            out.close();
            closed = true;
        } finally {
            if (!closed) {
                IOUtils.closeQuietly(out);
            }
        }
    }

    /**
     * Copies a stream to the given path, overwriting any existing file. The
     * input stream is not closed by this method.
     */
    private void write(Path path, InputStream in) throws IOException {
        FSDataOutputStream out = fileSystem.create(path, true);
        boolean closed = false;
        try {
            IOUtils.copy(in, out);
            // Close explicitly so that a failure while finalising the file is
            // reported to the caller instead of being silently swallowed.
            out.close();
            closed = true;
        } finally {
            if (!closed) {
                IOUtils.closeQuietly(out);
            }
        }
    }

    /**
     * @return the help text describing the value expected by
     *         {@link #init(String)}
     */
    @Override
    public String getConfigurationInformation() {
        return "Please provide the path to the Hadoop configuration (etc) folder";
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy