All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.digitalpebble.stormcrawler.warc.WARCFileNameFormat Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to DigitalPebble Ltd under one or more contributor license agreements. See the NOTICE
 * file distributed with this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License. You may obtain a copy of the
 * License at
 *
 * 

http://www.apache.org/licenses/LICENSE-2.0 * *

Unless required by applicable law or agreed to in writing, software distributed under the * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.warc; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Map; import java.util.TimeZone; import org.apache.storm.hdfs.bolt.format.FileNameFormat; import org.apache.storm.task.TopologyContext; /** * From the WARC specs It is helpful to use practices within an institution that make it unlikely or * impossible to duplicate aggregate WARC file names. The convention used inside the Internet * Archive with ARC files is to name files according to the following pattern: * Prefix-Timestamp-Serial-Crawlhost.warc.gz Prefix is an abbreviation usually reflective of the * project or crawl that created this file. Timestamp is a 14- digit GMT timestamp indicating the * time the file was initially begun. Serial is an increasing serial-number within the process * creating the files, often (but not necessarily) unique with regard to the Prefix. Crawlhost is * the domain name or IP address of the machine creating the file. */ public class WARCFileNameFormat implements FileNameFormat { private int taskIndex; private String path = "/"; private String prefix = "crawl"; private final String extension = ".warc.gz"; /** * Overrides the default prefix. * * @param prefix * @return */ public FileNameFormat withPrefix(String prefix) { this.prefix = prefix; return this; } public FileNameFormat withPath(String path) { this.path = path; return this; } @Override public void prepare(Map conf, TopologyContext topologyContext) { this.taskIndex = topologyContext.getThisTaskIndex(); int totalTasks = topologyContext.getComponentTasks(topologyContext.getThisComponentId()).size(); // single task? let's not bother with the task index in the file name if (totalTasks == 1) { this.taskIndex = -1; } } @Override public String getName(long rotation, long timeStamp) { SimpleDateFormat fileDate = new SimpleDateFormat("yyyyMMddHHmmss"); fileDate.setTimeZone(TimeZone.getTimeZone("GMT")); String taskindexString = ""; if (this.taskIndex != -1) { taskindexString = String.format("%02d", this.taskIndex) + "-"; } return this.prefix + "-" + fileDate.format(new Date(timeStamp)) + "-" + taskindexString + String.format("%05d", rotation) + this.extension; } public String getPath() { return this.path; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy