Please wait. This may take a few minutes ...
Downloading a project requires considerable resources. Please understand that we have to cover our server costs. Thank you in advance.
The project price is only $1.
You can buy this project and download/modify it as often as you want.
uk.bl.wa.indexer.WARCIndexerCommand Maven / Gradle / Ivy
/**
*
*/
package uk.bl.wa.indexer;
/*
* #%L
* warc-indexer
* $Id:$
* $HeadURL:$
* %%
* Copyright (C) 2013 - 2018 The webarchive-discovery project contributors
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/gpl-2.0.html>.
* #L%
*/
import java.io.*;
import java.nio.charset.Charset;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.zip.GZIPOutputStream;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrInputDocument;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.util.SurtPrefixSet;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigValueFactory;
import uk.bl.wa.annotation.Annotations;
import uk.bl.wa.annotation.Annotator;
import uk.bl.wa.solr.SolrFields;
import uk.bl.wa.solr.SolrRecord;
import uk.bl.wa.solr.SolrRecordFactory;
import uk.bl.wa.solr.SolrWebServer;
import uk.bl.wa.util.Instrument;
import uk.bl.wa.util.Normalisation;
/**
* @author Andrew Jackson
*
*/
public class WARCIndexerCommand {
/** Class-wide logger. */
private static final Log log = LogFactory.getLog(WARCIndexerCommand.class);

// Initialise the instrumentation helper once, when the class is loaded.
static {
    Instrument.init();
}

// Text handed to the help formatter by printUsage.
// NOTE(review): the option placeholders were empty ("[-o ]", "[-s ]", "[-b ]") in this
// copy of the file, apparently stripped together with other angle-bracketed text;
// the placeholder wording below is a reconstruction - confirm against the project history.
private static final String CLI_USAGE = "[-o <output dir>] [-s <Solr instance>] [-t] [-r] [-b <batch-submissions size>] [WARC File List]";
private static final String CLI_HEADER = "WARCIndexer - Extracts metadata and text from Archive Records";
private static final String CLI_FOOTER = "";

// When true, checkSubmission posts documents one at a time (as it also does when
// trace logging is enabled). Nothing in this class ever sets it.
private static boolean debugMode = false;

/** Value of the -i/--institution option, assigned by main when the option is present. */
public static String institution;
/** Value of the -n/--collection option, assigned by main when the option is present. */
public static String collection;
/** Value of the -u/--collection_id option, assigned by main when the option is present. */
public static String collection_id;
/**
 * Command-line entry point: parses and validates the options, then hands the
 * listed archive files to {@link #parseWarcFiles}.
 *
 * Exactly one of an output directory (-o) and a Solr URL (-s) must be given,
 * together with at least one archive file. Any violation of that, an
 * unparsable command line or a non-numeric batch size prints the usage text
 * and terminates the JVM with exit status 1.
 *
 * @param args the raw command-line arguments
 * @throws NoSuchAlgorithmException propagated from parseWarcFiles
 * @throws IOException if the output directory cannot be created, or propagated from parseWarcFiles
 * @throws TransformerFactoryConfigurationError propagated from parseWarcFiles
 * @throws TransformerException propagated from parseWarcFiles
 */
public static void main( String[] args ) throws NoSuchAlgorithmException, IOException, TransformerFactoryConfigurationError, TransformerException {
    final long allStart = System.nanoTime();
    CommandLineParser parser = new PosixParser();
    String outputDir = null;
    String solrUrl = null;
    String configFile = null;
    int batchSize = -1; // No explicit batch size (defaults to 1 if not stated in the conf-file)
    String annotationsFile = null;
    // Set when the command line could not be parsed; acted upon after the
    // finally block so that the timing information is still logged.
    int exitCode = 0;

    Options options = new Options();
    options.addOption("o", "output", true,
            "The directory to contain the output XML files");
    options.addOption("z", "gzip", false,
            "Pack the output XML files in a single gzipped XML file (only valid when -o has been specified)");
    options.addOption("s", "solr", true,
            "The URL of the Solr instance the document should be sent to");
    options.addOption("t", "text", false,
            "Include text in XML in output files");
    options.addOption("r", "slash", false,
            "Only process slash (root) pages.");
    options.addOption("a", "annotations", true,
            "A JSON file containing the annotations to apply during indexing.");
    options.addOption("b", "batch", true, "Batch size for submissions.");
    options.addOption("c", "config", true, "Configuration to use.");
    options.addOption("d", "disable_commit", false,
            "Disable client side commits (speeds up indexing at the cost of flush guarantee).");
    options.addOption("i", "institution", true, "Institution.");
    options.addOption("n", "collection", true, "Collection.");
    options.addOption("u", "collection_id", true, "Collection ID.");

    try {
        // parse the command line arguments
        CommandLine line = parser.parse( options, args );
        String[] cli_args = line.getArgs();

        // At least one archive file is mandatory
        if( cli_args.length == 0 ) {
            printUsage( options );
            System.exit( 1 );
        }

        boolean gzip = line.hasOption("z");

        // Get the output directory, if set
        if(line.hasOption("o")){
            outputDir = line.getOptionValue("o");
            // Normalise to exactly one trailing separator sequence
            if(outputDir.endsWith("/")||outputDir.endsWith("\\")){
                outputDir = outputDir.substring(0, outputDir.length()-1);
            }
            outputDir = outputDir + "//";
            System.out.println("Output Directory is: " + outputDir + " with gzip=" + gzip);
            File dir = new File(outputDir);
            if(!dir.exists()){
                FileUtils.forceMkdir(dir);
            }
        } else if (gzip) {
            // -z has no effect without -o; say so instead of ignoring it silently
            log.warn("Ignoring -z: gzip output is only valid when -o has been specified");
        }

        // Get the Solr Url, if set, dropping any double quotes around or inside it
        if(line.hasOption("s")){
            solrUrl = line.getOptionValue("s").replace("\"", "");
        }

        // Text is extracted when asked for explicitly and always when sending to Solr
        final boolean isTextRequired = line.hasOption("t") || line.hasOption("s");
        final boolean slashPages = line.hasOption( "r" );

        if( line.hasOption( "b" ) ) {
            String rawBatchSize = line.getOptionValue( "b" );
            try {
                batchSize = Integer.parseInt( rawBatchSize );
            } catch (NumberFormatException e) {
                System.out.println( "The batch size must be an integer, but was: " + rawBatchSize );
                printUsage(options);
                System.exit( 1 );
            }
        }

        if (line.hasOption("c")) {
            configFile = line.getOptionValue("c");
        }

        // Check that either an output dir or Solr URL is supplied
        if(outputDir == null && solrUrl == null){
            System.out.println( "A Solr URL or an Output Directory must be supplied" );
            printUsage(options);
            System.exit( 1 );
        }

        // Check that both an output dir and Solr URL are not supplied
        if(outputDir != null && solrUrl != null){
            System.out.println( "A Solr URL and an Output Directory cannot both be specified" );
            printUsage(options);
            System.exit( 1 );
        }

        // Pick up any annotations specified:
        if (line.hasOption("a")) {
            annotationsFile = line.getOptionValue("a");
        }

        // These three end up in the public static fields of this class
        if (line.hasOption("i")) {
            institution = line.getOptionValue("i");
        }
        if (line.hasOption("n")) {
            collection = line.getOptionValue("n");
        }
        if (line.hasOption("u")) {
            collection_id = line.getOptionValue("u");
        }

        // Check for commit disabling
        final boolean disableCommit = line.hasOption("d");

        parseWarcFiles(configFile, outputDir, gzip, solrUrl, cli_args,
                isTextRequired, slashPages, batchSize, annotationsFile,
                disableCommit, institution, collection, collection_id);
    } catch (org.apache.commons.cli.ParseException e) {
        log.error("Parse exception when processing command line arguments: "+e);
        printUsage(options);
        exitCode = 1;
    } finally {
        Instrument.timeRel("WARCIndexerCommand.main#total", allStart);
        Instrument.log(true);
    }

    // Report an unparsable command line to the caller as a failure
    if (exitCode != 0) {
        System.exit( exitCode );
    }
}
/**
 * Indexes every record of the given archive files and delivers the resulting
 * documents to exactly one sink per record: the gzipped XML file (when
 * {@code outputDir} is set and {@code gzip} is true), one XML file per record
 * below {@code outputDir} (when {@code solrUrl} is null), or Solr in batches.
 *
 * @param configFile path of a configuration file, or null to use the default configuration;
 *            if the file does not exist the JVM is terminated with exit status 1
 * @param outputDir output directory including its trailing separator, or null
 * @param gzip whether to write one gzipped XML file per archive instead of one file per record
 * @param solrUrl URL of the Solr instance, or null
 * @param args the archive files to process
 * @param isTextRequired passed on to the indexer's extract call
 * @param slashPages if true, only documents whose URL type is "slash page" are delivered
 * @param batchSize number of buffered documents that triggers a submission to Solr;
 *            -1 means "take warc.solr.batch_size from the configuration, defaulting to 1"
 * @param annotationsFile JSON annotations file, or null for no annotations
 * @param disableCommit if true, no client-side commits are issued; the configuration key
 *            warc.solr.disablecommit can also switch commits off
 * @param institution not used by this method
 * @param collection not used by this method
 * @param collection_id not used by this method
 * @throws NoSuchAlgorithmException
 * @throws IOException
 * @throws TransformerFactoryConfigurationError
 * @throws TransformerException
 */
public static void parseWarcFiles(String configFile, String outputDir, boolean gzip,
        String solrUrl, String[] args, boolean isTextRequired,
        boolean slashPages, int batchSize, String annotationsFile,
        boolean disableCommit, String institution, String collection,
        String collection_id)
        throws NoSuchAlgorithmException,
        TransformerFactoryConfigurationError, TransformerException,
        IOException {
    long startTime = System.currentTimeMillis();
    final long start = System.nanoTime();

    // Start from the default configuration and replace it by the given file, if any
    Config conf = ConfigFactory.load();
    if (configFile != null) {
        log.info("Loading config from log file: " + configFile);
        File configFilePath = new File(configFile);
        if (!configFilePath.exists()){
            log.error("Config file not found:"+configFile);
            // A missing configuration is a failure, so do not report success
            System.exit( 1 );
        }
        conf = ConfigFactory.parseFile(configFilePath);
        log.info("Loaded warc config.");
        log.info(conf.getString("warc.title"));
    }

    final SolrRecordFactory solrFactory = SolrRecordFactory.createFactory(conf);

    // A Solr URL given on the command line overrides the configured server
    if(solrUrl != null) {
        conf = conf.withValue(SolrWebServer.CONF_HTTP_SERVER, ConfigValueFactory.fromAnyRef(solrUrl) );
    }

    // Use config for default value
    if (conf.hasPath("warc.solr.disablecommit")) {
        disableCommit = disableCommit || conf.getBoolean("warc.solr.disablecommit");
    }
    if (batchSize == -1) { // Batch size not set as command line, so resolve it from conf with default 1
        batchSize = conf.hasPath("warc.solr.batch_size") ? conf.getInt("warc.solr.batch_size") : 1;
    }

    // Set up the server config:
    SolrWebServer solrWeb = new SolrWebServer(conf);

    // Also pass config down:
    WARCIndexer windex = new WARCIndexer(conf);

    // Add in annotations, if set:
    if (annotationsFile != null) {
        Annotations ann = Annotations.fromJsonFile(annotationsFile);
        SurtPrefixSet oaSurts = Annotator
                .loadSurtPrefix("openAccessSurts.txt");
        windex.setAnnotations(ann, oaSurts);
    }

    // Documents waiting to be sent to Solr; shared across all input files
    List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
    int totInputFile = args.length;
    int curInputFile = 1;

    Instrument.timeRel("WARCIndexerCommand.main#total",
            "WARCIndexerCommand.parseWarcFiles#startup", start);

    // Loop through each Warc files
    for (int arcsIndex = 0; arcsIndex < args.length; arcsIndex++) {
        final long arcStart = System.nanoTime();
        String inputFile = args[arcsIndex];

        if (!disableCommit) {
            // Commit to make sure index is up to date:
            commit(solrWeb);
        }

        System.out.println("Parsing Archive File [" + curInputFile + "/" + totInputFile + "]:" + inputFile);
        File inFile = new File(inputFile);
        String fileName = inFile.getName();
        String outputWarcDir = outputDir + fileName + "//";

        Writer zipOut = null;
        ArchiveReader reader = null;
        try {
            if (outputDir != null && gzip) {
                zipOut = new OutputStreamWriter(new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(
                        outputDir + fileName + ".xml.gz"))), Charset.forName("utf-8"));
                // NOTE(review): this copy of the file wrote an empty string here and a single
                // space at the end, which looks like stripped markup and leaves the gzip file
                // without a root element. <add> is the root of a Solr XML update message -
                // confirm against what SolrRecord.writeXml emits.
                zipOut.write("<add>");
            }

            // The per-archive directory is only needed for one-file-per-record output
            File dir = new File(outputWarcDir);
            if (!dir.exists() && solrUrl == null && zipOut == null) {
                FileUtils.forceMkdir(dir);
            }

            reader = ArchiveReaderFactory.get(inputFile);
            Iterator<ArchiveRecord> ir = reader.iterator();
            int recordCount = 1;
            int lastFailedRecord = 0;

            // Iterate though each record in the WARC file
            while (ir.hasNext()) {
                final long recordStart = System.nanoTime();
                ArchiveRecord rec = null;
                try {
                    rec = ir.next();
                } catch (RuntimeException e) {
                    log.warn("Exception on record after rec " + recordCount + " from " + inFile.getName(), e);
                    // Tolerate one failure per record position; a second failure at the
                    // same position means the iterator is stuck, so give up on this file.
                    if (lastFailedRecord != recordCount) {
                        lastFailedRecord = recordCount;
                        continue;
                    }
                    log.error("Failed to reach next record, last record already on error - skipping the rest of the records");
                    break;
                }

                final String url = Normalisation.sanitiseWARCHeaderValue(rec.getHeader().getUrl());
                // Header-only record; replaced by the extraction result on success
                SolrRecord doc = solrFactory.createRecord(inFile.getName(), rec.getHeader());
                log.debug("Processing record for url " + url + " from " + inFile.getName() + " @"
                        + rec.getHeader().getOffset());
                try {
                    doc = windex.extract(inFile.getName(), rec, isTextRequired);
                } catch (Exception e) {
                    log.warn("Exception on record " + url + " from " + inFile.getName(), e);
                    doc.addParseException(e);
                    // The record is skipped entirely: the header-only document is not delivered
                    continue;
                } catch (OutOfMemoryError e) {
                    log.warn("OutOfMemoryError on record " + url + " from " + inFile.getName(), e);
                    // Unlike above, execution falls through and the header-only document is delivered
                    doc.addParseException(e);
                }
                Instrument.timeRel("WARCIndexerCommand.parseWarcFiles#fullarcprocess",
                        "WARCIndexerCommand.parseWarcFiles#solrdocCreation", recordStart);

                if (doc != null) {
                    final long updateStart = System.nanoTime();
                    if (!slashPages || (doc.getFieldValue(SolrFields.SOLR_URL_TYPE) != null &&
                            doc.getFieldValue(SolrFields.SOLR_URL_TYPE).equals(SolrFields.SOLR_URL_TYPE_SLASHPAGE))) {
                        if (zipOut != null) {
                            doc.writeXml(zipOut);
                        } else if (solrUrl == null) {
                            File fileOutput = new File(outputWarcDir + "//" + "FILE_" + recordCount + ".xml");
                            writeXMLToFile(doc.toXml(), fileOutput);
                        } else {
                            docs.add(doc.getSolrDocument());
                            checkSubmission(solrWeb, docs, batchSize, false);
                        }
                        // Only delivered documents advance the counter used in the file names
                        recordCount++;
                    }
                    Instrument.timeRel("WARCIndexerCommand.parseWarcFiles#fullarcprocess",
                            "WARCIndexerCommand.parseWarcFiles#docdelivery", updateStart);
                }
            }

            if (zipOut != null) {
                zipOut.write("</add>");
                zipOut.flush();
            }
        } finally {
            // Release the archive file handle; a failure here does not affect what was written
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    log.warn("Failed to close archive reader for " + inputFile, e);
                }
            }
            // Closing finishes the gzip stream, so a failure here must reach the caller
            if (zipOut != null) {
                zipOut.close();
            }
        }

        curInputFile++;
        Instrument.timeRel("WARCIndexerCommand.main#total",
                "WARCIndexerCommand.parseWarcFiles#fullarcprocess", arcStart);
        Instrument.log(arcsIndex < args.length-1); // Don't log the last on info to avoid near-duplicate logging
    }

    // Submit any remaining docs:
    checkSubmission(solrWeb, docs, batchSize, true);

    if (!disableCommit) {
        // Commit the updates:
        commit(solrWeb);
    }

    long endTime = System.currentTimeMillis();
    System.out.println("WARC Indexer Finished in " + ((endTime - startTime) / 1000.0) + " seconds.");
}
/**
 * Asks the given Solr server to commit. A null server is ignored, and a
 * failing commit is logged as a warning rather than propagated.
 *
 * @param solrWeb the server to commit on, may be null
 */
private static void commit( SolrWebServer solrWeb) {
    if( solrWeb == null ) {
        // Nothing to commit to
        return;
    }
    final long commitStart = System.nanoTime();
    try {
        solrWeb.commit();
        // Only successful commits are timed
        Instrument.timeRel("WARCIndexerCommand.main#total", "WARCIndexerCommand.commit#success", commitStart);
    } catch( IOException ioFailure ) {
        log.warn( "IOException when committing.", ioFailure );
    } catch( SolrServerException solrFailure ) {
        log.warn( "SolrServerException when committing.", solrFailure );
    }
}
/**
 * Sends the buffered documents to Solr once the buffer holds at least
 * {@code limit} entries, or regardless of its size when {@code force} is set.
 * An empty buffer is never submitted.
 *
 * Normally the whole buffer is sent as one batch. When trace logging or the
 * debug mode is active, documents are posted one at a time instead, and the
 * first document that fails is logged and terminates the JVM with exit
 * status 1.
 *
 * The buffer is cleared after a successful submission. A SolrServerException
 * or IOException is logged as a warning and not propagated; in that case the
 * buffer is left untouched, so its documents are part of the next submission.
 *
 * @param solr the server to submit to
 * @param docs the buffer of pending documents; emptied on success
 * @param limit the buffer size that triggers a submission
 * @param force submit even if the buffer has not reached the limit
 */
private static void checkSubmission(SolrWebServer solr,
        List<SolrInputDocument> docs, int limit, boolean force) {
    if (docs.isEmpty() || (docs.size() < limit && !force)) {
        return;
    }
    try {
        final long start = System.nanoTime();
        if (log.isTraceEnabled() || debugMode) {
            // One request per document, so that a rejected document can be identified
            for (SolrInputDocument doc : docs) {
                try {
                    solr.updateSolrDoc(doc);
                } catch (Exception e) {
                    log.error(
                            "Failed to post document - got exception: ",
                            e);
                    log.error("Failed document was:\n"
                            + ClientUtils.toXML(doc));
                    System.exit(1);
                }
            }
        } else {
            solr.add(docs);
        }
        // The key used to be misspelt "WARCIndexerCommanc", unlike every other key of this class
        Instrument.timeRel(
                "WARCIndexerCommand.parseWarcFiles#docdelivery",
                "WARCIndexerCommand.checkSubmission#solrSendBatch", start);
        docs.clear();
    } catch (SolrServerException s) {
        log.warn("SolrServerException: ", s);
    } catch (IOException i) {
        log.warn("IOException: ", i);
    }
}
/**
 * Re-serialises the given XML with indentation switched on and prints the
 * result to standard output.
 *
 * @param doc the XML document to print
 * @throws TransformerFactoryConfigurationError if no transformer factory can be obtained
 * @throws TransformerException if the transformer cannot be created or the transformation fails
 */
public static void prettyPrintXML( String doc ) throws TransformerFactoryConfigurationError, TransformerException {
    // The transformation output is collected in memory before being printed
    final StringWriter buffer = new StringWriter();
    final Transformer indenter = TransformerFactory.newInstance().newTransformer();
    indenter.setOutputProperty(OutputKeys.INDENT, "yes");
    indenter.transform(new StreamSource(new StringReader(doc)), new StreamResult(buffer));
    System.out.println(buffer.toString());
}
/**
 * Re-serialises the given XML with indentation switched on and writes the
 * result to the given file.
 *
 * @param xml the XML document to write
 * @param file the file to write to
 * @throws IOException declared for callers; not thrown directly by this method
 * @throws TransformerFactoryConfigurationError if no transformer factory can be obtained
 * @throws TransformerException if the transformer cannot be created or the transformation fails
 */
public static void writeXMLToFile( String xml, File file ) throws IOException, TransformerFactoryConfigurationError, TransformerException {
    final Transformer indenter = TransformerFactory.newInstance().newTransformer();
    indenter.setOutputProperty(OutputKeys.INDENT, "yes");
    final Source input = new StreamSource(new StringReader(xml));
    final Result target = new StreamResult(file);
    indenter.transform(input, target);
}
/**
 * Prints the command-line help (usage line, header, option list and footer)
 * formatted to a width of 80 characters.
 *
 * @param options the options to describe
 */
private static void printUsage( Options options ) {
    final HelpFormatter formatter = new HelpFormatter();
    formatter.setWidth( 80 );
    formatter.printHelp( CLI_USAGE, CLI_HEADER, options, CLI_FOOTER );
}
}