org.archive.extract.CDXExtractorOutput Maven / Gradle / Ivy
The newest version!
package org.archive.extract;
import java.io.IOException;
import java.io.PrintStream;
import java.util.List;
import org.archive.format.json.JSONView;
import org.archive.resource.Resource;
import org.archive.util.StreamCopy;
public class CDXExtractorOutput implements ExtractorOutput {
// CANON DATE URL MIME HTTP-CODE SHA1 REDIR OFFSET FILE
private static String URL_SPEC = "Envelope.ARC-Header-Metadata.Target-URI|Envelope.WARC-Header-Metadata.Target-URI";
private static String DATE_SPEC = "Envelope.ARC-Header-Metadata.Date";
private static String MIME_SPEC = "Envelope.ARC-Header-Metadata.Content-Type";
private static String HTTP_CODE_SPEC = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Response-Message.Status";
private static String SHA1_SPEC = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Entity-Digest";
private static String REDIR_SPEC = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers.Content-Location";
private static String OFFSET_SPEC = "Container.Offset";
private static String FILENAME_SPEC = "Container.Filename";
private static String SPECS[] = {
URL_SPEC, DATE_SPEC, URL_SPEC, MIME_SPEC, HTTP_CODE_SPEC, SHA1_SPEC,
REDIR_SPEC, OFFSET_SPEC, FILENAME_SPEC
};
private static char EMPTY = '-';
private static char DELIM = ' ';
JSONView view;
private PrintStream out;
public CDXExtractorOutput(PrintStream out) {
view = new JSONView(SPECS);
this.out = out;
}
public void output(Resource resource) throws IOException {
StreamCopy.readToEOF(resource.getInputStream());
List> res = view.apply(resource.getMetaData().getTopMetaData());
StringBuilder sb = new StringBuilder();
for(List actual : res) {
sb.setLength(0);
// boolean first = true;
for(int i = 0; i < actual.size(); i++) {
// actual.set(5, actual.get(5).substring(5));
// for(String f : actual) {
if(i > 0) {
sb.append(DELIM);
}
String f = actual.get(i);
if((f == null) || (f.length() == 0)) {
sb.append(EMPTY);
} else {
if(i == 5) {
sb.append(f.substring(5));
} else {
sb.append(f);
}
}
}
out.println(sb.toString());
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy