All downloads are free. The search and download features use the official Maven repository.

org.archive.hadoop.ArchiveJSONViewLoader Maven / Gradle / Ivy

The newest version!
package org.archive.hadoop;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.lang.StringUtils;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.archive.format.json.JSONView;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * Pig loader that flattens the JSON carried by each record of the parent
 * {@link ArchiveMetadataLoader} into zero or more tuples, using a
 * {@link JSONView} built from the field definitions given to the constructor.
 *
 * <p>One parent record may expand into several output tuples; these are
 * buffered in {@link #cached} and handed out one per {@link #getNext()} call.
 * Parent records that produce no tuples (unparseable JSON, no view matches,
 * missing JSON field) are skipped rather than ending the load.
 */
public class ArchiveJSONViewLoader extends ArchiveMetadataLoader {
	private final static Logger LOG = 
		Logger.getLogger(ArchiveJSONViewLoader.class.getName());

	/**
	 * Index of the field in the parent loader's tuple that is read as the
	 * JSON document. NOTE(review): the value 2 is taken from the original
	 * code; confirm against the tuple layout of ArchiveMetadataLoader.
	 */
	private static final int JSON_FIELD_INDEX = 2;

	protected TupleFactory mCacheTupleFactory = TupleFactory.getInstance();
	/** Scratch list reused to build each output tuple; cleared after every use. */
	private final ArrayList<Object> mCacheProtoTuple = new ArrayList<Object>();
	private final JSONView view;
	/** Tuples produced from the current parent record and not yet returned, or null. */
	ArrayList<Tuple> cached;

	/**
	 * Creates a loader whose output tuples are defined by the given view fields.
	 *
	 * @param fieldArgs field definitions passed straight to {@link JSONView};
	 *        at least one is required
	 * @throws IllegalArgumentException if no field definition is supplied
	 */
	public ArchiveJSONViewLoader(String... fieldArgs) {
		super();
		// TODO: fix this logging...
		// This changes the level of the shared "org.archive" logger, not just
		// this class's logger.
		Logger.getLogger("org.archive").setLevel(Level.WARNING);
		cached = null;
		if(fieldArgs == null || fieldArgs.length == 0) {
			LOG.info("Constructed with NO foo");
			throw new IllegalArgumentException("No field definition");
		}
		if(LOG.isLoggable(Level.INFO)) {
			LOG.info("ArchiveJSONViewLoader:(" +
					StringUtils.join(fieldArgs,",") +
					")");
		}
		view = new JSONView(fieldArgs);
	}

	/**
	 * Returns the next flattened tuple.
	 *
	 * <p>Parent records are pulled until one yields at least one tuple, so a
	 * record with no matches no longer causes a premature {@code null}.
	 *
	 * @return the next tuple, or {@code null} once the parent loader has no
	 *         more records
	 * @throws IOException if the parent loader fails to read
	 */
	@Override
	public Tuple getNext() throws IOException {
		while(cached == null || cached.isEmpty()) {
			// try to load some more:
			Tuple inner = super.getNext();
			if(inner == null) {
				// all done
				cached = null;
				return null;
			}
			cached = applyView(inner);
		}
		Tuple next = cached.remove(0);
		if(cached.isEmpty()) {
			cached = null;
		}
		return next;
	}

	/**
	 * Applies the view to the JSON held in field {@link #JSON_FIELD_INDEX} of
	 * a parent tuple.
	 *
	 * @param inner tuple produced by the parent loader; may be null
	 * @return one tuple per view match, or {@code null} if the tuple is null,
	 *         too short, has a null JSON field, fails to parse, or matches
	 *         nothing
	 */
	private ArrayList<Tuple> applyView(Tuple inner) {
		if(inner == null) {
			return null;
		}
		if(inner.size() <= JSON_FIELD_INDEX) {
			LOG.warning("Skipping tuple with " + inner.size() +
					" field(s): no JSON field at index " + JSON_FIELD_INDEX);
			return null;
		}
		try {
			Object rawJson = inner.get(JSON_FIELD_INDEX);
			if(rawJson == null) {
				return null;
			}
			JSONObject json = new JSONObject(rawJson.toString());
			List<List<String>> matches = view.apply(json);
			if(matches == null || matches.isEmpty()) {
				return null;
			}
			ArrayList<Tuple> results = new ArrayList<Tuple>(matches.size());
			for(List<String> row : matches) {
				// Build the tuple from the reusable scratch list, then clear
				// the list for the next row, as the original code did.
				mCacheProtoTuple.addAll(row);
				Tuple tup = mCacheTupleFactory.newTuple(mCacheProtoTuple);
				mCacheProtoTuple.clear();
				results.add(tup);
			}
			return results;
		} catch (JSONException e) {
			// Pass the exception along so the cause is not lost.
			LOG.log(Level.WARNING, "Failed to parse JSON:"+e.getMessage(), e);
		} catch (ExecException e) {
			LOG.log(Level.WARNING, "ExecException:"+e.getMessage(), e);
		}

		return null;
	}

}