
// org.apache.oodt.cas.pushpull.retrievalmethod.RemoteCrawler (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.oodt.cas.pushpull.retrievalmethod;
//OODT imports
import org.apache.oodt.cas.filemgr.structs.exceptions.CatalogException;
import org.apache.oodt.cas.metadata.Metadata;
import org.apache.oodt.cas.protocol.exceptions.ProtocolException;
import org.apache.oodt.cas.pushpull.config.DataFilesInfo;
import org.apache.oodt.cas.pushpull.config.DownloadInfo;
import org.apache.oodt.cas.pushpull.exceptions.AlreadyInDatabaseException;
import org.apache.oodt.cas.pushpull.exceptions.ParserException;
import org.apache.oodt.cas.pushpull.exceptions.RetrievalMethodException;
import org.apache.oodt.cas.pushpull.exceptions.ToManyFailedDownloadsException;
import org.apache.oodt.cas.pushpull.exceptions.UndefinedTypeException;
import org.apache.oodt.cas.pushpull.filerestrictions.FileRestrictions;
import org.apache.oodt.cas.pushpull.filerestrictions.Parser;
import org.apache.oodt.cas.pushpull.filerestrictions.VirtualFile;
import org.apache.oodt.cas.pushpull.filerestrictions.VirtualFileStructure;
import org.apache.oodt.cas.protocol.ProtocolFile;
import org.apache.oodt.cas.protocol.util.ProtocolFileFilter;
import org.apache.oodt.cas.pushpull.protocol.ProtocolPath;
import org.apache.oodt.cas.pushpull.protocol.RemoteSite;
import org.apache.oodt.cas.pushpull.protocol.RemoteSiteFile;
import org.apache.oodt.cas.pushpull.retrievalsystem.DataFileToPropFileLinker;
import org.apache.oodt.cas.pushpull.retrievalsystem.FileRetrievalSystem;
//JDK imports
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.net.MalformedURLException;
import java.util.List;
import java.util.Stack;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Retrieval method that crawls a remote site's directory tree, starting at
 * the root described by a property file, and hands every file that passes
 * the property file's restrictions to the {@link FileRetrievalSystem}
 * download queue.
 *
 * @author bfoster
 * @version $Revision$
 */
public class RemoteCrawler implements RetrievalMethod {

  private static final Logger LOG = Logger.getLogger(RemoteCrawler.class
      .getName());

  /**
   * Parses <code>propFile</code> into a virtual file structure, changes to
   * the structure's root directory on the remote site and then crawls from
   * there: directories are expanded one page of children at a time, and
   * files are linked to the property file and added to the download queue.
   *
   * @param frs
   *          retrieval system used to navigate the remote site and to queue
   *          downloads
   * @param propFileParser
   *          parser that turns the property file into a
   *          {@link VirtualFileStructure}
   * @param propFile
   *          property file describing what to crawl
   * @param dfi
   *          supplies the {@link DownloadInfo} and the query metadata element
   *          name passed to the download queue
   * @param linker
   *          records which data files were found on behalf of which property
   *          file, and which property files failed
   * @throws FileNotFoundException
   *           if <code>propFile</code> cannot be opened for reading
   * @throws ParserException
   *           declared for the property file parsing step
   * @throws ProtocolException
   *           declared for the remote navigation performed before the crawl
   *           loop starts
   * @throws MalformedURLException
   *           declared for the remote navigation performed before the crawl
   *           loop starts
   * @throws RetrievalMethodException
   *           if too many downloads failed, the catalog could not be
   *           reached, or any other unexpected error occurred while crawling
   */
  @Override
  public void processPropFile(FileRetrievalSystem frs, Parser propFileParser,
      File propFile, DataFilesInfo dfi, DataFileToPropFileLinker linker)
      throws FileNotFoundException, ParserException, ProtocolException,
      MalformedURLException, RetrievalMethodException {

    // parse property file; the stream is always closed afterwards so that
    // one file descriptor is not leaked per processed property file
    Metadata fileMetadata = new Metadata();
    VirtualFileStructure vfs;
    FileInputStream propFileStream = new FileInputStream(propFile);
    try {
      vfs = propFileParser.parse(propFileStream, fileMetadata);
    } finally {
      try {
        propFileStream.close();
      } catch (java.io.IOException e) {
        // a failed close must not mask the parse result or a parse failure
        LOG.log(Level.WARNING, "Failed to close property file " + propFile
            + " : " + e.getMessage());
      }
    }

    // determine RemoteSite: the site named in the property file wins only
    // when alias override is allowed and the property file names one
    DownloadInfo di = dfi.getDownloadInfo();
    RemoteSite remoteSite = di.isAllowAliasOverride() ? vfs.getRemoteSite()
        : null;
    if (remoteSite == null) {
      remoteSite = di.getRemoteSite();
    }

    // modify vfs to be root based if HOME directory based
    if (!vfs.isRootBased()) {
      String homeDirPath = frs.getHomeDir(remoteSite).getPath();
      VirtualFile root = new VirtualFile(homeDirPath, true);
      root.addChild(vfs.getRootVirtualFile());
      vfs = new VirtualFileStructure(homeDirPath + "/" + vfs.getPathToRoot(),
          root.getRootDir());
      frs.changeToHOME(remoteSite);
    }

    // initialize variables
    final String initialCdPath = vfs.getPathToRoot();
    final VirtualFile vf = vfs.getRootVirtualFile();

    // change to initial directory (takes care of Linux auto-mounting)
    frs.changeToDir(initialCdPath, remoteSite);

    // add starting directory to stack
    Stack<RemoteSiteFile> files = new Stack<RemoteSiteFile>();
    files.push(new RemoteSiteFile(frs.getCurrentFile(remoteSite), remoteSite));

    // start crawling (depth first: the most recently added entry is next)
    while (!files.isEmpty()) {
      RemoteSiteFile file = files.peek();
      try {
        if (file.isDir()) {
          // get next page worth of children, keeping only those allowed by
          // the property file's virtual file structure
          List<RemoteSiteFile> children = frs.getNextPage(file,
              new ProtocolFileFilter() {
                @Override
                public boolean accept(ProtocolFile pFile) {
                  return FileRestrictions.isAllowed(new ProtocolPath(pFile
                      .getPath(), pFile.isDir()), vf);
                }
              });

          if (!children.isEmpty()) {
            // the directory stays on the stack so its next page is fetched
            // once these children have been processed
            files.addAll(children);
          } else {
            // no more pages: the directory is done
            files.pop();
          }
        } else {
          // file: link it to the property file and queue it for download.
          // The entry is popped before queueing, so a skipped file (see the
          // WARNING handlers below) is never visited twice.
          linker.addPropFileToDataFileLink(propFile, file);
          if (!frs.addToDownloadQueue(files.pop(), di.getRenamingConv(), di
              .getStagingArea(), dfi.getQueryMetadataElementName(), di
              .deleteFromServer(), fileMetadata)) {
            linker.eraseLinks(propFile);
          }
        }
      } catch (ToManyFailedDownloadsException e) {
        throw new RetrievalMethodException(
            "Connection appears to be down. . .unusual number of download failures. . .stopping : "
                + e.getMessage(), e);
      } catch (CatalogException e) {
        throw new RetrievalMethodException(
            "Failed to communicate with database : " + e.getMessage(), e);
      } catch (AlreadyInDatabaseException e) {
        LOG.log(Level.WARNING, "Skipping file : " + e.getMessage());
      } catch (UndefinedTypeException e) {
        LOG.log(Level.WARNING, "Skipping file : " + e.getMessage());
      } catch (Exception e) {
        // anything else aborts the crawl and marks the property file failed
        linker.markAsFailed(propFile, e.getMessage());
        throw new RetrievalMethodException(
            "Uknown error accured while downloading " + file + " from "
                + remoteSite + " -- bailing out : " + e.getMessage(), e);
      }
    }
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy