All downloads are free. The search and download features use the official Maven repository.

gate.cloud.io.twitter.TwitterJSONOutputHandler Maven / Gradle / Ivy

Go to download

Document Format plugin to support reading and writing Twitter style JSON files

The newest version!
package gate.cloud.io.twitter;

import static gate.cloud.io.IOConstants.PARAM_FILE_EXTENSION;

import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonGenerator.Feature;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.GateConstants;
import gate.Utils;
import gate.annotation.AnnotationSetImpl;
import gate.cloud.batch.DocumentID;
import gate.cloud.io.file.AbstractFileOutputHandler;
import gate.corpora.DocumentJsonUtils;
import gate.corpora.export.TwitterJsonExporter;
import gate.util.GateException;
import gate.util.OffsetComparator;

public class TwitterJSONOutputHandler extends AbstractFileOutputHandler {

	private TwitterJsonExporter exporter = new TwitterJsonExporter();

	private static final JsonFactory JSON_FACTORY = new JsonFactory().enable(Feature.AUTO_CLOSE_TARGET);

	@Override
	protected void configImpl(Map configData) throws IOException, GateException {
		// make sure we default to .json as the extension
		if (!configData.containsKey(PARAM_FILE_EXTENSION)) {
			configData.put(PARAM_FILE_EXTENSION, ".json");
		}

		super.configImpl(configData);
	}

	@Override
	protected void outputDocumentImpl(Document document, DocumentID documentId) throws IOException, GateException {

		Map> annotationSetsMap = collectAnnotations(document);

		Map> originalMap = annotationSetsMap;
		annotationSetsMap = new HashMap>();
		for (Collection annSet : originalMap.values()) {
			for (Annotation a : annSet) {
				Collection annsByType = annotationSetsMap.get(a.getType());
				if (annsByType == null) {
					annsByType = new AnnotationSetImpl(document);
					annotationSetsMap.put(a.getType(), annsByType);
				}
				annsByType.add(a);
			}
		}

		AnnotationSet originalMarkups = document.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);

		OutputStream outputStream = getFileOutputStream(documentId);
		OutputStreamWriter writer = new OutputStreamWriter(outputStream,
				(encoding == null || encoding.length() == 0 ? "UTF-8" : encoding));
		JsonGenerator generator = JSON_FACTORY.createGenerator(writer);

		try {

			if (originalMarkups.get("TweetSegment").isEmpty()) {
				Comparator comparator = new OffsetComparator();
				for (Map.Entry> entry : annotationSetsMap.entrySet()) {
					List list = new ArrayList();
					list.addAll(entry.getValue());
					Collections.sort(list, comparator);
					entry.setValue(list);
				}
				
				AnnotationSet documentAnnotationSet = originalMarkups.get("Tweet");
				if (documentAnnotationSet.size() > 1) {
					throw new GateException("Found more than one Tweet annotation for document " + documentId);
				}
				if (documentAnnotationSet.size() > 0) {
					Annotation documentAnnotation = Utils.getOnlyAnn(documentAnnotationSet);
					DocumentJsonUtils.writeDocument(document, Utils.start(documentAnnotation),
							Utils.end(documentAnnotation), annotationSetsMap, documentAnnotation.getFeatures(), null,
							generator);
					return;
				}

				// if we get here we either didn't have documentAnnotationType
				// set, or it was set but the document contained no such
				// annotation - simply output the whole document with no extra
				// features.
				DocumentJsonUtils.writeDocument(document, 0L, Utils.end(document), annotationSetsMap, null, null,
						generator);

			} else {
				// we have an actual tweet object we can do something with

				// create a FeatureMap of options to drive the underlying Twitter exporter
				FeatureMap options = Factory.newFeatureMap();
				options.put("annotationsMap", annotationSetsMap);

				// export the document
				exporter.export(document, generator, options);
			}

		} finally {
			generator.close();
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy