All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.io.Arc2Warc Maven / Gradle / Ivy

Go to download

The Archive Commons Code Libraries project contains general Java utility libraries, as used by the Heritrix crawler and other projects.

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.io;

import java.io.BufferedOutputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.time.ZoneOffset;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.archive.io.arc.ARCConstants;
import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.format.warc.WARCConstants;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.io.warc.WARCWriter;
import org.archive.io.warc.WARCWriterPoolSettings;
import org.archive.io.warc.WARCWriterPoolSettingsData;
import org.archive.uid.RecordIDGenerator;
import org.archive.uid.UUIDGenerator;
import org.archive.util.FileUtils;
import org.archive.util.anvl.ANVLRecord;

/**
 * Convert ARCs to (sortof) WARCs.
 * @author stack
 * @version $Date$ $Revision$
 */
public class Arc2Warc {
    private static final DateTimeFormatter ARC_DATE_FORMAT = DateTimeFormatter.ofPattern("yyyyMMddHHmmss")
            .withZone(ZoneOffset.UTC);
    protected RecordIDGenerator generator = new UUIDGenerator();
    
    private static void usage(HelpFormatter formatter, Options options,
           int exitCode) {
       formatter.printHelp("java org.archive.io.arc.Arc2Warc " +
       		"[--force] ARC_INPUT WARC_OUTPUT", options);
       System.exit(exitCode);
   }
   
   private static String getRevision() {
       return Warc2Arc.parseRevision("$Revision$");
   }
   
   public void transform(final File arc, final File warc, final boolean force)
   throws IOException {
       FileUtils.assertReadable(arc);
       if (warc.exists() && !force) {
    	   throw new IOException("Target WARC already exists. " +
    	       "Will not overwrite.");
       }

       ARCReader reader = ARCReaderFactory.get(arc, false, 0);
       transform(reader, warc);
   }
   
   protected void transform(final ARCReader reader, final File warc)
   throws IOException {
	   WARCWriter writer = null;
	   // No point digesting. Digest is available after reading of ARC which
	   // is too late for inclusion in WARC.
	   reader.setDigest(false);
	   try {
		   BufferedOutputStream bos =
			   new BufferedOutputStream(new FileOutputStream(warc));
		   // Get the body of the first ARC record as a String so can dump it
		   // into first record of WARC.
		   final Iterator i = reader.iterator();
		   ARCRecord firstRecord = (ARCRecord)i.next();
		   ByteArrayOutputStream baos =
			   new ByteArrayOutputStream((int)firstRecord.getHeader().
			       getLength());
		   firstRecord.dump(baos);
	       // Add ARC first record content as an ANVLRecord.
	       ANVLRecord ar = new ANVLRecord();
	       ar.addLabelValue("Filedesc", baos.toString());
	       List metadata = new ArrayList(1);
	       metadata.add(ar.toString());
	       // Now create the writer.  If reader was compressed, lets write
	       // a compressed WARC.
		   writer = new WARCWriter(
		               new AtomicInteger(),
		               bos, 
		               warc, 
		               new WARCWriterPoolSettingsData(
		                       "", "", -1, reader.isCompressed(), null, metadata, generator));
		   // Write a warcinfo record with description about how this WARC
		   // was made.
		   writer.writeWarcinfoRecord(warc.getName(),
		       "Made from " + reader.getReaderIdentifier() + " by " +
	               this.getClass().getName() + "/" + getRevision());
		   for (; i.hasNext();) {
			   write(writer, (ARCRecord)i.next());
		   }
	   } finally {
		   if (reader != null) {
			   reader.close();
		   }
		   if (writer != null) {
			   // I don't want the close being logged -- least, not w/o log of
			   // an opening (and that'd be a little silly for simple script
			   // like this). Currently, it logs at level INFO so that close
			   // of files gets written to log files.  Up the log level just
			   // for the close.
			   Logger l = Logger.getLogger(writer.getClass().getName());
			   Level oldLevel = l.getLevel();
			   l.setLevel(Level.WARNING);
			   try {
				   writer.close();
			   } finally {
				   l.setLevel(oldLevel);
			   }
		   }
	   }
   }
   
   protected void write(final WARCWriter writer, final ARCRecord r)
   throws IOException {
       WARCRecordInfo recordInfo = new WARCRecordInfo();
       recordInfo.setUrl(r.getHeader().getUrl());
       recordInfo.setContentStream(r);
       recordInfo.setContentLength(r.getHeader().getLength());
       recordInfo.setEnforceLength(true);

       // convert ARC date to WARC-Date format
       String arcDateString = r.getHeader().getDate();

       String warcDateString = DateTimeFormatter.ISO_DATE_TIME.format(ARC_DATE_FORMAT.parse(arcDateString));
       recordInfo.setCreate14DigitDate(warcDateString);

       ANVLRecord ar = new ANVLRecord();
       String ip = (String)r.getHeader()
           .getHeaderValue((ARCConstants.IP_HEADER_FIELD_KEY));
       if (ip != null && ip.length() > 0) {
           ar.addLabelValue(WARCConstants.NAMED_FIELD_IP_LABEL, ip);
           r.getMetaData();
       }
       recordInfo.setExtraHeaders(ar);

       // enable reconstruction of ARC from transformed WARC
       // TODO: deferred for further analysis (see HER-1750) 
       // ar.addLabelValue("ARC-Header-Line", r.getHeaderString());

       // If contentBody > 0, assume http headers.  Make the mimetype
       // be application/http.  Otherwise, give it ARC mimetype.
       if (r.getHeader().getContentBegin() > 0) {
           recordInfo.setType(WARCRecordType.response);
           recordInfo.setMimetype(WARCConstants.HTTP_RESPONSE_MIMETYPE);
           recordInfo.setRecordId(generator.getRecordID());
       } else {
           recordInfo.setType(WARCRecordType.resource);
           recordInfo.setMimetype(r.getHeader().getMimetype());
           recordInfo.setRecordId(((WARCWriterPoolSettings)writer.settings).getRecordIDGenerator().getRecordID());
       }

       writer.writeRecord(recordInfo);
   }

   /**
    * Command-line interface to Arc2Warc.
    *
    * @param args Command-line arguments.
    * @throws ParseException Failed parse of the command line.
    * @throws IOException
    * @throws java.text.ParseException
    */
   @SuppressWarnings("unchecked")
public static void main(String [] args)
   throws ParseException, IOException, java.text.ParseException {
       Options options = new Options();
       options.addOption(new Option("h","help", false,
           "Prints this message and exits."));
       options.addOption(new Option("f","force", false,
       	   "Force overwrite of target file."));
       PosixParser parser = new PosixParser();
       CommandLine cmdline = parser.parse(options, args, false);
       List cmdlineArgs = cmdline.getArgList();
       Option [] cmdlineOptions = cmdline.getOptions();
       HelpFormatter formatter = new HelpFormatter();
       
       // If no args, print help.
       if (cmdlineArgs.size() <= 0) {
           usage(formatter, options, 0);
       }

       // Now look at options passed.
       boolean force = false;
       for (int i = 0; i < cmdlineOptions.length; i++) {
           switch(cmdlineOptions[i].getId()) {
               case 'h':
                   usage(formatter, options, 0);
                   break;
                   
               case 'f':
                   force = true;
                   break;
                   
               default:
                   throw new RuntimeException("Unexpected option: " +
                       + cmdlineOptions[i].getId());
           }
       }
       
       // If no args, print help.
       if (cmdlineArgs.size() != 2) {
           usage(formatter, options, 0);
       }
       (new Arc2Warc()).transform(new File(cmdlineArgs.get(0)),
           new File(cmdlineArgs.get(1)), force);
   }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy