/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2008-2016 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.crawler.file.process;

import com.jaeksoft.searchlib.Logging;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.config.Config;
import com.jaeksoft.searchlib.crawler.common.database.FetchStatus;
import com.jaeksoft.searchlib.crawler.common.database.IndexStatus;
import com.jaeksoft.searchlib.crawler.common.database.ParserStatus;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatistics;
import com.jaeksoft.searchlib.crawler.common.process.CrawlStatus;
import com.jaeksoft.searchlib.crawler.common.process.CrawlThreadAbstract;
import com.jaeksoft.searchlib.crawler.file.database.*;
import com.jaeksoft.searchlib.crawler.file.spider.CrawlFile;
import com.jaeksoft.searchlib.util.InfoCallback;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
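
/**
 * Crawler thread in charge of a single {@link FilePathItem}: it walks the
 * file tree, compares each entry against the index through the
 * {@link FileManager}, and pushes the resulting {@link CrawlFile} objects to
 * the shared {@link FileCrawlQueue}, throttled by the delay configured on the
 * path item.
 */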
public class CrawlFileThread extends CrawlThreadAbstract {
private final CrawlFileMaster crawlMaster;
private final FileCrawlQueue crawlQueue;
private final FileManager fileManager;
private final long delayBetweenAccesses;
private final FilePathItem filePathItem;
private volatile FileItem currentFileItem;
private long nextTimeTarget;
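
/**
 * Builds a crawler thread bound to the given {@link FilePathItem}, sharing
 * the crawl queue and session statistics of the {@link CrawlFileMaster}.
 */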
protected CrawlFileThread(Config config, CrawlFileMaster crawlMaster, CrawlStatistics sessionStats,
FilePathItem filePathItem, InfoCallback infoCallback) throws SearchLibException {
super(config, crawlMaster, null, infoCallback);
this.fileManager = config.getFileManager();
this.crawlMaster = (CrawlFileMaster) getThreadMaster();
this.crawlQueue = (FileCrawlQueue) crawlMaster.getCrawlQueue();
currentStats = new CrawlStatistics(sessionStats);
this.delayBetweenAccesses = filePathItem.getDelay();
nextTimeTarget = 0;
this.filePathItem = filePathItem;
this.currentFileItem = null;
}
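
/**
 * Enforces the delay configured on the {@link FilePathItem} between two
 * accesses: sleeps until the next time target is reached, but never longer
 * than {@code max} milliseconds, and immediately schedules the next target.
 */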
private void sleepInterval(long max) throws InterruptedException {
long c = System.currentTimeMillis();
long ms = nextTimeTarget - c;
nextTimeTarget = c + delayBetweenAccesses;
if (ms < 0)
return;
if (ms > max)
ms = max;
sleepMs(ms);
}
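
/**
 * Recursively walks the given file instance. Directories are listed (down to
 * the optional depth limit) and their children browsed; plain files are only
 * crawled when {@link #checkFile(FileItem)} reports a change. Each crawled
 * entry is pushed to the crawl queue and indexing is triggered.
 */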
void browse(final FileInstanceAbstract fileInstance, final Integer depth)
throws SearchLibException, URISyntaxException, NoSuchAlgorithmException, IOException,
InstantiationException, IllegalAccessException, ClassNotFoundException, HttpException,
InterruptedException {
if (isAborted() || crawlMaster.isAborted())
return;
if (fileInstance == null)
return;
FileItem fileItem = fileManager.getNewFileItem(fileInstance);
setCurrentFileItem(fileItem);
FileTypeEnum fileType = fileItem.getFileType();
if (fileType == null)
return;
switch (fileType) {
case directory:
if (depth != null && depth == 0)
break;
FileInstanceAbstract[] files = checkDirectory(fileInstance);
if (files == null)
break;
for (FileInstanceAbstract file : files)
browse(file, depth == null ? null : depth - 1);
break;
case file:
if (!checkFile(fileItem))
return;
break;
default:
return;
}
CrawlFile crawl = crawl(fileInstance, fileItem);
if (crawl != null)
crawlQueue.add(currentStats, crawl);
setStatus(CrawlStatus.INDEXATION);
crawlQueue.index(false);
}
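
/**
 * Thread entry point: builds the root file instance from the configured
 * {@link FilePathItem}, browses it recursively, then flushes the crawl queue
 * (forcing the flush once the crawl master is no longer running).
 */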
@Override
public void runner() throws Exception {
CrawlFileMaster crawlMaster = (CrawlFileMaster) getThreadMaster();
FileCrawlQueue crawlQueue = (FileCrawlQueue) crawlMaster.getCrawlQueue();
FileInstanceAbstract fileInstance = FileInstanceAbstract.create(filePathItem, null, filePathItem.getPath());
browse(fileInstance, null);
crawlQueue.index(!crawlMaster.isRunning());
}
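
/**
 * Crawls a single entry: waits for the configured access delay, downloads the
 * file, and marks it {@code TO_INDEX} when it was fetched and parsed
 * successfully and indexing is not forbidden by {@code META_NOINDEX}.
 * The elapsed time is recorded on the {@link FileItem}.
 */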
private CrawlFile crawl(FileInstanceAbstract fileInstance, FileItem fileItem)
throws SearchLibException, InterruptedException {
long startTime = System.currentTimeMillis();
sleepInterval(60000);
setStatus(CrawlStatus.CRAWL);
currentStats.incUrlCount();
CrawlFile crawl = new CrawlFile(fileInstance, fileItem, getConfig(), currentStats);
// Fetch started
currentStats.incFetchedCount();
crawl.download();
if (fileItem.getFetchStatus() == FetchStatus.FETCHED && fileItem.getParserStatus() == ParserStatus.PARSED
&& fileItem.getIndexStatus() != IndexStatus.META_NOINDEX) {
fileItem.setIndexStatus(IndexStatus.TO_INDEX);
currentStats.incParsedCount();
} else
currentStats.incIgnoredCount();
fileItem.setTime((int) (System.currentTimeMillis() - startTime));
return crawl;
}
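
/**
 * Removes the given entry from the index through the crawl queue. When the
 * entry is a directory, every indexed child found under its URI is removed
 * recursively as well.
 */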
private final void smartDelete(FileCrawlQueue crawlQueue, FileInfo fileInfo) throws SearchLibException {
crawlQueue.delete(currentStats, fileInfo.getUri());
if (fileInfo.getFileType() != FileTypeEnum.directory)
return;
HashMap<String, FileInfo> indexFileMap = new HashMap<>();
try {
fileManager.getFileInfoList(new URI(fileInfo.getUri()), indexFileMap);
for (FileInfo fi : indexFileMap.values())
smartDelete(crawlQueue, fi);
} catch (UnsupportedEncodingException e) {
Logging.warn(e);
} catch (URISyntaxException e) {
Logging.warn(e);
}
}
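
/**
 * Reconciles a directory with the index: indexed sub-directories are removed
 * when sub-directory crawling is disabled, indexed entries that no longer
 * exist on disk are deleted, and the current listing is returned for further
 * browsing.
 */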
private FileInstanceAbstract[] checkDirectory(FileInstanceAbstract fileInstance)
throws SearchLibException, URISyntaxException, IOException {
// Load directory from Index
HashMap<String, FileInfo> indexFileMap = new HashMap<>();
fileManager.getFileInfoList(fileInstance.getURI(), indexFileMap);
boolean withSubDir = filePathItem.isWithSubDir();
// If the filePathItem does not include sub-directories, delete any indexed ones
if (!withSubDir)
for (FileInfo fileInfo : indexFileMap.values())
if (fileInfo.getFileType() == FileTypeEnum.directory)
smartDelete(crawlQueue, fileInfo);
// Remove from the map the entries that still exist on disk
FileInstanceAbstract[] files =
withSubDir ? fileInstance.listFilesAndDirectories() : fileInstance.listFilesOnly();
if (files != null)
for (FileInstanceAbstract file : files)
indexFileMap.remove(file.getURI().toASCIIString());
// The files that remain in the map no longer exist on disk and can be removed
if (indexFileMap.size() > 0)
for (FileInfo fileInfo : indexFileMap.values())
smartDelete(crawlQueue, fileInfo);
return files;
}
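
/**
 * Returns {@code true} when the file is new or has been modified since the
 * last crawl; otherwise the file is counted as ignored and {@code false} is
 * returned.
 */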
private boolean checkFile(FileItem fileItem)
throws UnsupportedEncodingException, SearchLibException, URISyntaxException {
FileInfo oldFileInfo = fileManager.getFileInfo(fileItem.getUri());
// The file is a new file
if (oldFileInfo == null) {
return true;
}
// The file has been modified
if (oldFileInfo.isNewCrawlNeeded(fileItem))
return true;
// The file has not changed, we don't need to crawl it
currentStats.incIgnoredCount();
return false;
}
public FileItem getCurrentFileItem() {
return currentFileItem;
}
public void setCurrentFileItem(FileItem item) {
currentFileItem = item;
}
@Override
public String getCurrentInfo() {
FileItem fileItem = currentFileItem;
return fileItem == null ? StringUtils.EMPTY : fileItem.getDirectory();
}
}