All downloads are free. The search and download features use the official Maven repository.

org.archive.format.arc.ARCMetaDataParser Maven / Gradle / Ivy

The newest version!
package org.archive.format.arc;

import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.util.Date;

import org.archive.util.DateUtils;
import org.archive.util.StringParse;

/**
 * Parses the one-line metadata header of an ARC record into an
 * {@link ARCMetaData}.
 *
 * <p>The header is read byte by byte up to the first newline that follows at
 * least one non-newline byte. It is then split on {@code DELIMITER} into, in
 * order: URL, IP, date, MIME type and length.
 *
 * <p>Every call to {@code parse} writes into the same internal byte buffer, so
 * one instance must not be used by several threads at the same time.
 */
public class ARCMetaDataParser implements ARCConstants {
	/** Default strictness used by the overloads that take no {@code strict} flag. */
	private final boolean strict = false;
	/** Scratch space for the header line; its size caps the accepted header length. */
	private final byte[] buffer;
	
	/**
	 * Creates a parser whose header buffer holds {@code MAX_META_LENGTH} bytes.
	 */
	public ARCMetaDataParser() {
		this(MAX_META_LENGTH);
	}

	/**
	 * Creates a parser with a caller-chosen header buffer size.
	 *
	 * @param maxLength number of header bytes that can be buffered; a header
	 *        whose newline is not seen within this many bytes is rejected
	 */
	public ARCMetaDataParser(int maxLength) {
		buffer = new byte[maxLength];
	}

	/**
	 * Parses a header using this parser's default strictness; EOF before any
	 * header byte is an error.
	 *
	 * @param is stream positioned at the start of a header line
	 * @return the parsed metadata
	 * @throws ARCFormatException if the header is malformed
	 * @throws IOException if reading from {@code is} fails
	 */
	public ARCMetaData parse(InputStream is) 
	throws ARCFormatException, IOException {
		return parse(is,strict);
	}

	/**
	 * Parses a header with explicit strictness; EOF before any header byte is
	 * an error.
	 *
	 * @param is stream positioned at the start of a header line
	 * @param strict whether to apply strict validation
	 * @return the parsed metadata
	 * @throws ARCFormatException if the header is malformed
	 * @throws IOException if reading from {@code is} fails
	 */
	public ARCMetaData parse(InputStream is, boolean strict) 
	throws ARCFormatException, IOException {
		return parse(is,strict,false);
	}

	/**
	 * Like {@link #parse(InputStream)}, but returns {@code null} instead of
	 * throwing when the stream ends before any header byte was buffered.
	 *
	 * @param is stream positioned at the start of a header line
	 * @return the parsed metadata, or {@code null} on an empty stream
	 * @throws ARCFormatException if the header is malformed
	 * @throws IOException if reading from {@code is} fails
	 */
	public ARCMetaData parseEOFOK(InputStream is) 
	throws ARCFormatException, IOException {
		return parseEOFOK(is,strict);
	}

	/**
	 * Like {@link #parse(InputStream, boolean)}, but returns {@code null}
	 * instead of throwing when the stream ends before any header byte was
	 * buffered.
	 *
	 * @param is stream positioned at the start of a header line
	 * @param strict whether to apply strict validation
	 * @return the parsed metadata, or {@code null} on an empty stream
	 * @throws ARCFormatException if the header is malformed
	 * @throws IOException if reading from {@code is} fails
	 */
	public ARCMetaData parseEOFOK(InputStream is, boolean strict) 
	throws ARCFormatException, IOException {
		return parse(is,strict,true);
	}

	/**
	 * Reads one header line from {@code is} and parses it.
	 *
	 * <p>Newlines seen before the first other byte are counted (and reported
	 * through {@code setLeadingNL}) but not buffered. Once the first delimiter
	 * space has been seen, every further byte must be in the range 32..127.
	 *
	 * <p>In strict mode the line must split into exactly {@code FIELD_COUNT}
	 * fields. In lenient mode two deviations are tolerated:
	 * <ul>
	 * <li>one extra non-empty token, which is assumed to belong to the MIME
	 *     type and is joined to it with {@code "%20"};</li>
	 * <li>any number of empty trailing tokens (i.e. trailing delimiters).</li>
	 * </ul>
	 *
	 * @param is stream positioned at the start of a header line
	 * @param strict whether to apply strict validation
	 * @param emptyOK if {@code true}, EOF before any header byte was buffered
	 *        yields {@code null} rather than an exception
	 * @return the parsed metadata, or {@code null} (only when {@code emptyOK})
	 * @throws ARCFormatException if the header is malformed, too long for the
	 *         buffer, or cut off by EOF
	 * @throws IOException if reading from {@code is} fails
	 */
	public ARCMetaData parse(InputStream is, boolean strict, boolean emptyOK) 
	throws ARCFormatException, IOException {

		ARCMetaData data = new ARCMetaData();

		int bufPos = 0;
		int leadingNL = 0;
		boolean gotNonNL = false;
		boolean gotNL = false;
		boolean sawSpace = false;
		
		int bufLen = buffer.length;
		// TODO: lax on leading newlines?
		while(bufPos < bufLen) {
			int c = is.read();

			if(c == -1) {
				// EOF before anything was buffered (leading newlines are
				// not buffered, so they do not count here)
				if(emptyOK && (bufPos == 0)) {
					return null;
				}
				throw new ARCFormatException("Got EOF before newline");
			} else if(c == 10) {
				if(gotNonNL) {
					// newline terminating the header line
					gotNL = true;
					break;
				} else {
					// newline before any content: count it and skip it
					leadingNL++;
					continue;
				}
			} else {
				gotNonNL = true;
				if(sawSpace) {
					// check for non-ASCII bytes, they are not allowed after URL
					if((c < 32) || (c > 127)) {
						throw new ARCFormatException("Non-ASCII CHARS in metadata.");
					}
				} else {
					// In the URL field, non-ASCII OK until we see a ' ':
					sawSpace = (c == 32);
				}
			}
			buffer[bufPos] = (byte) (c & 0xff);
			bufPos++;
		}
		if(!gotNL) {
			// the buffer filled up before a terminating newline was seen
			throw new ARCFormatException("No newline");
		}
		data.setLeadingNL(leadingNL);
		data.setHeaderLength(bufPos + 1);// +1 for the newline

		String line = new String(buffer,0,bufPos,ARC_META_CHARSET);
		// limit -1 keeps trailing empty tokens so trailing delimiters show up
		String[] parts = line.split(DELIMITER,-1);

		if(parts.length != FIELD_COUNT) {
			if(strict) {
				throw new ARCFormatException("Wrong number of fields in("+line+")");
			}
			if(parts.length < FIELD_COUNT) {
				throw new ARCFormatException("Too few fields in("+line+")");
			}
		}

		// URL IP DATE MIME LEN
		String mime = parts[3];
		String length = parts[4];

		if(parts.length == FIELD_COUNT + 1 && parts[5].length() != 0) {

			// mostly likely extra is in the MIME, but could be in the URL...
			// for now, let's only allow in the MIME:
			mime = parts[3]+"%20"+parts[4];
			length = parts[5];

		} else if(parts.length > FIELD_COUNT) {

			// if they're all "blank" that is, if there are some trailing
			// spaces, that's OK in non-strict mode... A single trailing
			// space lands here too: its one extra token is empty, so it must
			// not be mistaken for a MIME continuation.
			for(int i=FIELD_COUNT; i < parts.length; i++) {
				if(parts[i].length() != 0) {
					throw new ARCFormatException("Extra fields in("+line+")");
				}
			}
			// TODO: warn
		}

		parseUrl(data,parts[0],strict);
		parseIP(data,parts[1],strict);
		parseDate(data,parts[2],strict);
		parseMime(data,mime,strict);
		parseLength(data,length,strict);

		return data;
	}

	/**
	 * Stores the URL field; only an empty URL is rejected.
	 * The {@code strict} flag is currently not consulted.
	 */
	private void parseUrl(ARCMetaData data, final String u, boolean strict)
	throws ARCFormatException {
		if(u.length() < 1) {
			throw new ARCFormatException("Bad Url(" + u + ")");
		}
		data.setUrl(u);
	}

	/**
	 * Stores the IP field. Strict mode requires {@code StringParse.isIP} to
	 * accept it; lenient mode only requires it to be non-empty.
	 */
	private void parseIP(ARCMetaData data, final String ip, boolean strict)
	throws ARCFormatException {
		if(strict) {
			if(!StringParse.isIP(ip)) {
				throw new ARCFormatException("Bad IP address(" + ip + ")");
			}
		} else if(ip.length() < 1) {
			throw new ARCFormatException("Bad IP(" + ip + ")");
		}
		data.setIP(ip);
	}

	/**
	 * Parses the date field with {@code DateUtils.getDate} and stores both the
	 * parsed {@link Date} and the original text.
	 * The {@code strict} flag is currently not consulted.
	 */
	private void parseDate(ARCMetaData data, final String ds, boolean strict)
	throws ARCFormatException {
		try {
			Date d = DateUtils.getDate(ds);
			data.setDateBoth(d,ds);
		} catch (ParseException e) {
			// NOTE(review): the ParseException is not chained as a cause;
			// ARCFormatException's constructors are not visible from here.
			throw new ARCFormatException("Bad date(" + ds + ")");
		}
	}

	/**
	 * Stores the MIME field. An empty value is an error in strict mode and is
	 * replaced by {@code DEFAULT_MIME} in lenient mode.
	 */
	private void parseMime(ARCMetaData data, final String m, boolean strict)
	throws ARCFormatException {
		if(m.length() >= 1) {
			data.setMime(m);
		} else if(strict) {
			throw new ARCFormatException("Bad Mime(" + m + ")");
		} else {
			data.setMime(DEFAULT_MIME);
		}
	}

	/**
	 * Parses the length field as a non-negative decimal {@code long}.
	 * The {@code strict} flag is currently not consulted.
	 */
	private void parseLength(ARCMetaData data, final String ls, boolean strict)
	throws ARCFormatException {

		long l;
		try {
			// parseLong avoids the box/unbox round trip of Long.valueOf
			l = Long.parseLong(ls);
		} catch(NumberFormatException e) {
			throw new ARCFormatException("Bad Length(" + ls + ")");
		}
		if(l < 0) {
			throw new ARCFormatException("Bad Length(" + ls + ")");
		}
		data.setLength(l);
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy