// Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. Project price: only $1.
// You can buy this project and download/modify it as often as you want.
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.recrawl;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.logging.ConsoleHandler;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.SerializationUtils;
import org.archive.bdb.BdbModule;
import org.archive.modules.CrawlURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.OneLineSimpleLogger;
import org.archive.util.SURT;
import org.archive.util.bdbje.EnhancedEnvironment;
import org.archive.util.iterator.LineReadingIterator;
import org.json.JSONObject;
import com.sleepycat.bind.serial.SerialBinding;
import com.sleepycat.bind.serial.StoredClassCatalog;
import com.sleepycat.bind.tuple.StringBinding;
import com.sleepycat.collections.StoredIterator;
import com.sleepycat.collections.StoredSortedMap;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.EnvironmentConfig;
/**
* Superclass for Processors which utilize BDB-JE for URI state
* (including most notably history) persistence.
*
* @author gojomo
*/
public abstract class PersistProcessor extends AbstractPersistProcessor {
// Serialization version identifier. The annotation silences the
// "unused private field" warning for this constant.
// NOTE(review): whether this class is actually Serializable depends on
// AbstractPersistProcessor, which is not visible here -- confirm.
@SuppressWarnings("unused")
private static final long serialVersionUID = 1L;
// java.util.logging logger named after this class's fully-qualified name.
private static final Logger logger =
Logger.getLogger(PersistProcessor.class.getName());
/**
 * Name of the history Database; copyPersistEnv passes it to
 * openDatabase when opening the source environment's history DB.
 */
public static final String URI_HISTORY_DBNAME = "uri_history";
/**
 * Shared configuration for the URI history database: non-transactional,
 * created if absent, and opened in deferred-write mode.
 */
public static final BdbModule.BdbConfig HISTORY_DB_CONFIG =
        new BdbModule.BdbConfig();
static {
    // Configure the shared instance in place; the values are fixed for
    // the lifetime of the class.
    HISTORY_DB_CONFIG.setTransactional(false);
    HISTORY_DB_CONFIG.setAllowCreate(true);
    HISTORY_DB_CONFIG.setDeferredWrite(true);
}
/**
 * Creates the processor. No state is initialized here beyond what the
 * superclass constructor does.
 */
public PersistProcessor() {
    super();
}
/**
 * Computes the preferred String key under which the given CrawlURI's
 * state is persisted. The key is derived from the string form of the
 * CrawlURI's UURI via {@link #persistKeyFor(String)}.
 *
 * @param curi CrawlURI whose key is wanted; must not be null
 * @return String key
 */
public static String persistKeyFor(CrawlURI curi) {
    final String uriText = curi.getUURI().toString();
    // the String overload builds the case-sensitive SURT, chosen for
    // uniqueness and sorting benefits
    return persistKeyFor(uriText);
}
/**
 * Computes the preferred String key for persisting state associated
 * with the given URI string.
 *
 * @param uri URI in string form
 * @return String key: the SURT form of the URI
 */
public static String persistKeyFor(String uri) {
    // a case-sensitive SURT is used for uniqueness and sorting benefits
    final String surtKey = SURT.fromURI(uri, true);
    return surtKey;
}
/**
* Copies entries from an existing environment db to a new one. If
* historyMap is not provided, only logs the entries that would have been
* copied.
*
* @param sourceDir existing environment database directory
* @param historyMap new environment db (or null for a dry run)
* @return number of records
* @throws DatabaseException
*/
private static int copyPersistEnv(File sourceDir, StoredSortedMap historyMap)
throws DatabaseException {
int count = 0;
// open the source env history DB, copying entries to target env
EnhancedEnvironment sourceEnv = setupCopyEnvironment(sourceDir, true);
StoredClassCatalog sourceClassCatalog = sourceEnv.getClassCatalog();
DatabaseConfig historyDbConfig = HISTORY_DB_CONFIG.toDatabaseConfig();
historyDbConfig.setReadOnly(true);
Database sourceHistoryDB = sourceEnv.openDatabase(
null, URI_HISTORY_DBNAME, historyDbConfig);
StoredSortedMap sourceHistoryMap = new StoredSortedMap(sourceHistoryDB,
new StringBinding(), new SerialBinding