org.archive.io.Arc2Warc Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-commons Show documentation
The Archive Commons Code Libraries project contains general Java utility
libraries, as used by the Heritrix crawler and other projects.
The newest version!
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io;
import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.archive.io.arc.ARCConstants;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.format.warc.WARCConstants;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.io.warc.WARCWriter;
import org.archive.io.warc.WARCWriterPoolSettings;
import org.archive.io.warc.WARCWriterPoolSettingsData;
import org.archive.uid.RecordIDGenerator;
import org.archive.uid.UUIDGenerator;
import org.archive.util.FileUtils;
import org.archive.util.anvl.ANVLRecord;
/**
 * Convert ARCs to (sortof) WARCs.
 *
 * <p>Reads each record of an input ARC file and rewrites it as a WARC
 * record: the ARC "filedesc" record becomes the WARC's warcinfo payload,
 * records with HTTP headers become {@code response} records, and all
 * others become {@code resource} records.
 *
 * @author stack
 * @version $Date$ $Revision$
 */
public class Arc2Warc {

    /**
     * Parser for the 14-digit ARC date stamp (yyyyMMddHHmmss), interpreted
     * as UTC. Cached as a constant: DateTimeFormatter is thread-safe.
     */
    private static final DateTimeFormatter ARC_DATE_FORMAT =
            DateTimeFormatter.ofPattern("yyyyMMddHHmmss")
                    .withZone(ZoneOffset.UTC);

    /** Source of WARC-Record-IDs for every record this instance writes. */
    protected RecordIDGenerator generator = new UUIDGenerator();

    /**
     * Print command-line usage and terminate the JVM.
     *
     * @param formatter help formatter to print with
     * @param options the defined command-line options
     * @param exitCode process exit code
     */
    private static void usage(HelpFormatter formatter, Options options,
            int exitCode) {
        formatter.printHelp("java org.archive.io.arc.Arc2Warc " +
                "[--force] ARC_INPUT WARC_OUTPUT", options);
        System.exit(exitCode);
    }

    /** @return the SCM revision string embedded at build time. */
    private static String getRevision() {
        return Warc2Arc.parseRevision("$Revision$");
    }

    /**
     * Convert an ARC file to a WARC file.
     *
     * @param arc readable source ARC (compression is auto-detected)
     * @param warc target WARC file
     * @param force overwrite {@code warc} if it already exists
     * @throws IOException if the source is unreadable, the target exists
     *         and {@code force} is false, or conversion fails
     */
    public void transform(final File arc, final File warc, final boolean force)
            throws IOException {
        FileUtils.assertReadable(arc);
        if (warc.exists() && !force) {
            throw new IOException("Target WARC already exists. " +
                    "Will not overwrite.");
        }
        ARCReader reader = ARCReaderFactory.get(arc, false, 0);
        transform(reader, warc);
    }

    /**
     * Drive the conversion: write a warcinfo record carrying the ARC
     * filedesc content, then copy every remaining ARC record.
     *
     * @param reader open reader over the source ARC; closed on return
     * @param warc target WARC file
     * @throws IOException on read or write failure
     */
    protected void transform(final ARCReader reader, final File warc)
            throws IOException {
        WARCWriter writer = null;
        BufferedOutputStream bos = null;
        // No point digesting. Digest is available after reading of ARC which
        // is too late for inclusion in WARC.
        reader.setDigest(false);
        try {
            bos = new BufferedOutputStream(new FileOutputStream(warc));
            // Get the body of the first ARC record (the filedesc) as a
            // String so it can be dumped into the first record of the WARC.
            final Iterator<ArchiveRecord> i = reader.iterator();
            ARCRecord firstRecord = (ARCRecord) i.next();
            ByteArrayOutputStream baos = new ByteArrayOutputStream(
                    (int) firstRecord.getHeader().getLength());
            firstRecord.dump(baos);
            // Add ARC first record content as an ANVLRecord.
            ANVLRecord ar = new ANVLRecord();
            ar.addLabelValue("Filedesc", baos.toString());
            List<String> metadata = new ArrayList<String>(1);
            metadata.add(ar.toString());
            // Now create the writer. If reader was compressed, write a
            // compressed WARC too.
            writer = new WARCWriter(
                    new AtomicInteger(),
                    bos,
                    warc,
                    new WARCWriterPoolSettingsData(
                            "", "", -1, reader.isCompressed(), null, metadata,
                            generator));
            // Write a warcinfo record describing how this WARC was made.
            writer.writeWarcinfoRecord(warc.getName(),
                    "Made from " + reader.getReaderIdentifier() + " by " +
                    this.getClass().getName() + "/" + getRevision());
            while (i.hasNext()) {
                write(writer, (ARCRecord) i.next());
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
            if (writer != null) {
                // The writer logs its close at INFO; raise the level
                // temporarily so a simple one-shot tool like this doesn't
                // emit a close message without a matching open.
                Logger l = Logger.getLogger(writer.getClass().getName());
                Level oldLevel = l.getLevel();
                l.setLevel(Level.WARNING);
                try {
                    writer.close();
                } finally {
                    l.setLevel(oldLevel);
                }
            } else if (bos != null) {
                // Writer construction failed after the stream was opened;
                // close it here so the target file handle is not leaked.
                bos.close();
            }
        }
    }

    /**
     * Write a single ARC record into the WARC.
     *
     * @param writer destination WARC writer
     * @param r source ARC record; streamed as the WARC record's content
     * @throws IOException on write failure
     */
    protected void write(final WARCWriter writer, final ARCRecord r)
            throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setUrl(r.getHeader().getUrl());
        recordInfo.setContentStream(r);
        recordInfo.setContentLength(r.getHeader().getLength());
        recordInfo.setEnforceLength(true);
        // Convert the 14-digit ARC date to the ISO-8601 WARC-Date form
        // (UTC offset is supplied by ARC_DATE_FORMAT's zone override).
        String arcDateString = r.getHeader().getDate();
        String warcDateString = DateTimeFormatter.ISO_DATE_TIME.format(
                ARC_DATE_FORMAT.parse(arcDateString));
        recordInfo.setCreate14DigitDate(warcDateString);
        // Carry the capture IP over as a WARC named field when present.
        ANVLRecord ar = new ANVLRecord();
        String ip = (String) r.getHeader().getHeaderValue(
                ARCConstants.IP_HEADER_FIELD_KEY);
        if (ip != null && ip.length() > 0) {
            ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip);
        }
        recordInfo.setExtraHeaders(ar);
        // enable reconstruction of ARC from transformed WARC
        // TODO: deferred for further analysis (see HER-1750)
        // ar.addLabelValue("ARC-Header-Line", r.getHeaderString());
        // If content begins past offset 0, assume http headers and use the
        // application/http mimetype; otherwise keep the ARC mimetype.
        if (r.getHeader().getContentBegin() > 0) {
            recordInfo.setType(WARCRecordType.response);
            recordInfo.setMimetype(WARCConstants.HTTP_RESPONSE_MIMETYPE);
        } else {
            recordInfo.setType(WARCRecordType.resource);
            recordInfo.setMimetype(r.getHeader().getMimetype());
        }
        // Both branches used the same underlying generator (the writer was
        // constructed with this.generator); draw from it directly.
        recordInfo.setRecordId(generator.getRecordID());
        writer.writeRecord(recordInfo);
    }

    /**
     * Command-line interface to Arc2Warc.
     *
     * @param args Command-line arguments.
     * @throws ParseException Failed parse of the command line.
     * @throws IOException
     * @throws java.text.ParseException
     */
    @SuppressWarnings("unchecked")
    public static void main(String[] args)
            throws ParseException, IOException, java.text.ParseException {
        Options options = new Options();
        options.addOption(new Option("h", "help", false,
                "Prints this message and exits."));
        options.addOption(new Option("f", "force", false,
                "Force overwrite of target file."));
        PosixParser parser = new PosixParser();
        CommandLine cmdline = parser.parse(options, args, false);
        List<String> cmdlineArgs = cmdline.getArgList();
        Option[] cmdlineOptions = cmdline.getOptions();
        HelpFormatter formatter = new HelpFormatter();
        // If no args, print help.
        if (cmdlineArgs.size() <= 0) {
            usage(formatter, options, 0);
        }
        // Now look at options passed.
        boolean force = false;
        for (int i = 0; i < cmdlineOptions.length; i++) {
            switch (cmdlineOptions[i].getId()) {
            case 'h':
                usage(formatter, options, 0);
                break;
            case 'f':
                force = true;
                break;
            default:
                throw new RuntimeException("Unexpected option: " +
                        cmdlineOptions[i].getId());
            }
        }
        // Require exactly two positional args: input ARC and output WARC.
        if (cmdlineArgs.size() != 2) {
            usage(formatter, options, 0);
        }
        (new Arc2Warc()).transform(new File(cmdlineArgs.get(0)),
                new File(cmdlineArgs.get(1)), force);
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy