All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jwat.warc.WarcFileNamingDefault Maven / Gradle / Ivy

Go to download

Used for reading, writing and validating WARC files. Implemented to follow the WARC/1.0 ISO specification as closely as possible.

There is a newer version: 1.2.1
Show newest version
/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.warc;

import java.net.InetAddress;
import java.net.UnknownHostException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Default WARC file naming implementation used for writing to multiple files.
 * Automatically add '.gz' to the extension.
 * Format: prefix-date-sequenceNr-hostname.extension
 *
 * @author nicl
 */
public class WarcFileNamingDefault implements WarcFileNaming {

    /** DateFormat to the following format 'yyyyMMddHHmmss'. */
    protected DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss");

    /** Prefix component. */
    protected String filePrefix;

    /** Date component. */
    protected Date date;

    /** Date component converted into a human readable string. */
    protected String dateStr;

    /** Host name component. */
    protected String hostname;

    /** Extension component (including leading "."). */
    protected String extension;

    /**
     * Construct file naming instance.
     * @param filePrefix prefix or null, will default to "JWAT"
     * @param date date or null, if you want current date
     * @param hostname host name or null, if you want to use default local host name
     * @param extension extension or null, will default to ".warc"
     */
    public WarcFileNamingDefault(String filePrefix, Date date, String hostname, String extension) {
        if (filePrefix != null) {
            this.filePrefix = filePrefix;
        } else {
            this.filePrefix = "JWAT";
        }
        if (date != null) {
            this.date = date;
        } else {
            this.date = new Date();
        }
        if (hostname != null ) {
            this.hostname = hostname;
        } else {
            try {
                this.hostname = InetAddress.getLocalHost().getHostName().toLowerCase();
            } catch (UnknownHostException e) {
                this.hostname = "unknown";
            }
        }
        if (extension != null) {
            this.extension = extension;
        } else {
            this.extension = ".warc";
        }
        dateStr = dateFormat.format(this.date);
    }

    @Override
    public boolean supportMultipleFiles() {
        return true;
    }

    @Override
    public String getFilename(int sequenceNr, boolean bCompressed) {
        String filename = filePrefix + "-" + dateStr + "-" + String.format("%05d", sequenceNr++) + "-" + hostname + extension;
        if (bCompressed) {
            filename += ".gz";
        }
        return filename;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy