// fr.univnantes.termsuite.io.json.JsonTerminologyIO (Maven / Gradle / Ivy artifact listing header)

/*******************************************************************************
 * Copyright 2015-2016 - CNRS (Centre National de Recherche Scientifique)
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *******************************************************************************/
package fr.univnantes.termsuite.io.json;

import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import fr.univnantes.termsuite.engines.gatherer.PropertyValue;
import fr.univnantes.termsuite.framework.TermSuiteFactory;
import fr.univnantes.termsuite.index.Terminology;
import fr.univnantes.termsuite.model.Component;
import fr.univnantes.termsuite.model.CompoundType;
import fr.univnantes.termsuite.model.ContextVector;
import fr.univnantes.termsuite.model.Document;
import fr.univnantes.termsuite.model.IndexedCorpus;
import fr.univnantes.termsuite.model.Lang;
import fr.univnantes.termsuite.model.OccurrenceStore;
import fr.univnantes.termsuite.model.Property;
import fr.univnantes.termsuite.model.PropertyHolder;
import fr.univnantes.termsuite.model.Relation;
import fr.univnantes.termsuite.model.RelationProperty;
import fr.univnantes.termsuite.model.RelationType;
import fr.univnantes.termsuite.model.Term;
import fr.univnantes.termsuite.model.TermBuilder;
import fr.univnantes.termsuite.model.TermOccurrence;
import fr.univnantes.termsuite.model.TermProperty;
import fr.univnantes.termsuite.model.TermWord;
import fr.univnantes.termsuite.model.Word;
import fr.univnantes.termsuite.model.WordBuilder;

public class JsonTerminologyIO {
	
	private static final Logger LOGGER = LoggerFactory.getLogger(JsonTerminologyIO.class);
	
	/*
	 * Error messages for parsing. The %s placeholders are filled in by the
	 * Preconditions checks that use these templates.
	 */
	private static final String MSG_EXPECT_PROP_FOR_VAR = "Expecting %s property for term variation";
	private static final String MSG_EXPECT_PROP_FOR_OCCURRENCE = "Expecting %s property for occurrence";
	private static final String MSG_EXPECT_PROP_FOR_TERM_WORD = "Expecting %s property for term word";
	private static final String MSG_NO_GROUPING_KEY_SET = "No GROUPING_KEY set for term";
	private static final String MSG_NO_FILE_SOURCE_WITH_ID = "No file source with id: %s";
	private static final String MSG_NO_GKEY_FOR_TERM = "No grouping found for current term.";
	private static final String MSG_WORD_NOT_FOUND = "No such word: %s";
	private static final String MSG_NO_SUCH_TERM_IN_TERMINO = "No such term in terminology: %s";

	/*
	 * Occurrence storing options: values of the OCCURRENCE_STORAGE metadata
	 * property. When loading, "disk" selects a persistent occurrence store and
	 * any other value (or no value) selects an in-memory store.
	 */
	private static final String OCCURRENCE_STORAGE_EMBEDDED = "embedded";
	private static final String OCCURRENCE_STORAGE_DISK = "disk";
	
	/*
	 * Json properties
	 */
	
	/*
	 * Term data model
	 */
	// "words" names both the root array of words and the array of words of a term
	private static final String TERM_WORDS = "words";
	// root array of term occurrences
	private static final String TERM_OCCURRENCES = "occurrences";
	// per-term object holding the context vector
	private static final String TERM_CONTEXT = "context";
	
	// root array of relations between terms
	private static final String TERM_RELATIONS = "relations";
	// root object describing the terminology (lang, name, corpus id, storage, counters)
	private static final String METADATA = "metadata";
	private static final String LANG = "lang";
	private static final String NAME = "name";
	private static final String CORPUS_ID = "corpus-id";
	// lemma of a word, of a word component, or of a word referenced by a term
	private static final String LEMMA = "lemma";
	// word component fields
	private static final String SUBSTRING = "substring";
	private static final String STEM = "stem";
	private static final String COMPOUND_TYPE = "compound_type";
	private static final String COMPOUND_NEOCLASSICAL_AFFIX = "neoAffix";
	private static final String COMPONENTS = "components";
	// begin/end offsets, read for word components and for occurrences
	private static final String BEGIN = "begin";
	// grouping key of the term an occurrence belongs to
	private static final String TERM_ID = "tid";
	private static final String END = "end";
	// root array of terms
	private static final String TERMS = "terms";
	// syntactic label of a word inside a term
	private static final String SYN = "syn";
	private static final String RELATION_TYPE = "type";
	// object of properties, read for terms and for relations
	private static final String PROPERTIES = "props";
	private static final String IS_SWT = "swt";
	
	// grouping key of the source term of a relation
	private static final String FROM = "from";
	
	// grouping key of the target term of a relation
	private static final String TO = "to";

	// covered text of an occurrence
	private static final String TEXT = "text";
	// root object mapping integer source ids to document urls
	private static final String INPUT_SOURCES = "input_sources";
	// id of an input source, as referenced by an occurrence
	private static final String FILE = "file";
	// context vector fields
	private static final String CO_OCCURRENCES = "cooccs";
	private static final String NB_COCCS = "cnt";
	private static final String ASSOC_RATE = "assoc_rate";
	private static final String CO_TERM = "co_term";
	private static final String TOTAL_COOCCURRENCES = "total_cooccs";
	// metadata fields selecting the occurrence store
	private static final String OCCURRENCE_STORAGE = "occurrence_storage";
	private static final String OCCURRENCE_PERSITENT_STORE_PATH = "persistent_store_path";

	// metadata counters
	private static final String NB_WORD_ANNOTATIONS = "wordsNum";
	private static final String NB_SPOTTED_TERMS = "spottedTermsNum";

	
	
	/**
	 * Loads a json-serialized terminology from the given reader and returns it,
	 * together with its occurrence store, as an {@link IndexedCorpus}.
	 * <p>
	 * The root JSON object is scanned field by field. The <code>metadata</code>
	 * section creates the {@link Terminology} and the {@link OccurrenceStore};
	 * the words, terms, input sources, relations and occurrences sections are
	 * then loaded into them, so <code>metadata</code> is expected to come first
	 * in the stream. Relations whose source or target term is unknown are
	 * skipped with a warning.
	 * <p>
	 * The JSON parser is closed when this method returns or throws. With
	 * Jackson's default <code>AUTO_CLOSE_SOURCE</code> setting, which this
	 * method does not change, closing the parser also closes <code>reader</code>.
	 * 
	 * @param reader
	 * 			The source of the json-serialized terminology.
	 * @param options
	 * 			The deserialization {@link JsonOptions}. When
	 * 			<code>isMetadataOnly()</code> is set, loading stops right after the
	 * 			metadata section. When <code>isWithContexts()</code> is set, the
	 * 			context vectors of terms are rebuilt once all terms are loaded.
	 * @return
	 * 			The loaded {@link IndexedCorpus}, or <code>null</code> if the input
	 * 			contains no metadata section.
	 * @throws JsonParseException
	 * 			if the input is not valid JSON
	 * @throws IOException
	 * 			if reading from <code>reader</code> fails
	 * @throws IllegalStateException
	 * 			if a mandatory metadata or term property is missing, or if a term
	 * 			holds an unexpected field
	 * @throws IllegalArgumentException
	 * 			if an input source key is not an integer or if an occurrence lacks
	 * 			its begin, end or file property
	 * @throws NullPointerException
	 * 			if a mandatory relation or occurrence property is missing, or if an
	 * 			occurrence references an unknown term or input source
	 */
	public static IndexedCorpus load(Reader reader, JsonOptions options) throws IOException {
		Terminology termino = null;
		OccurrenceStore occurrenceStore = null;
		IndexedCorpus indexedCorpus = null;
		String fieldname;

		/*
		 * NOTE(review): the variables below are shared by the whole method and
		 * are not reset before a word component is read, so a component that
		 * omits one of these fields receives the value read previously. This
		 * is kept as is; confirm whether it is intended before changing it.
		 */
		String compLemma = null;
		String substring = null;
		boolean neoclassicalAffix = false;
		int begin = -1;
		int end = -1;
		int fileSource = -1;

		int nbWordAnnos = -1;
		int nbSpottedTerms = -1;
		Lang lang = null;

		// input source id -> document url
		Map<Integer, String> inputSources = Maps.newTreeMap();

		// term grouping key -> raw context vector entries, resolved once all terms are known
		Map<String, List<TempVecEntry>> contextVectors = Maps.newHashMap();

		// useful var for debug
		JsonToken tok;

		JsonFactory jsonFactory = new JsonFactory();
		// try-with-resources releases the parser on the metadata-only early
		// return and whenever a check or the parser itself throws
		try (JsonParser jp = jsonFactory.createParser(reader)) {
			jp.enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES);
			jp.enable(JsonParser.Feature.STRICT_DUPLICATE_DETECTION);

			while ((tok = jp.nextToken()) != JsonToken.END_OBJECT) {
				if (tok == null) {
					// End of input reached without the end of a root object (e.g.
					// empty stream): nextToken() keeps returning null from now on,
					// so the loop would never terminate.
					break;
				}

				fieldname = jp.getCurrentName();
				if (METADATA.equals(fieldname)) {
					// skip the START_OBJECT of the metadata section
					jp.nextToken();
					String terminoName = null;
					String corpusID = null;
					String occurrenceStorage = null;
					String persitentStorePath = null;

					while ((tok = jp.nextToken()) != JsonToken.END_OBJECT) {
						fieldname = jp.getCurrentName();
						if (LANG.equals(fieldname)) {
							lang = Lang.forName(jp.nextTextValue());
						} else if (NAME.equals(fieldname)) {
							terminoName = jp.nextTextValue();
						} else if (NB_WORD_ANNOTATIONS.equals(fieldname)) {
							nbWordAnnos = jp.nextIntValue(-1);
						} else if (NB_SPOTTED_TERMS.equals(fieldname)) {
							nbSpottedTerms = jp.nextIntValue(-1);
						} else if (CORPUS_ID.equals(fieldname)) {
							corpusID = jp.nextTextValue();
						} else if (OCCURRENCE_STORAGE.equals(fieldname)) {
							occurrenceStorage = jp.nextTextValue();
						} else if (OCCURRENCE_PERSITENT_STORE_PATH.equals(fieldname)) {
							persitentStorePath = jp.nextTextValue();
						}
					}
					Preconditions.checkState(lang != null, "The property meta.lang must be defined");
					Preconditions.checkState(terminoName != null, "The property meta.name must be defined");

					if (OCCURRENCE_STORAGE_DISK.equals(occurrenceStorage)) {
						Preconditions.checkNotNull(persitentStorePath, "Missing attribute " + OCCURRENCE_PERSITENT_STORE_PATH);
						occurrenceStore = TermSuiteFactory.createPersitentOccurrenceStore(persitentStorePath, lang);
					} else {
						occurrenceStore = TermSuiteFactory.createMemoryOccurrenceStore(lang);
					}
					termino = TermSuiteFactory.createTerminology(lang, terminoName);
					if (corpusID != null)
						termino.setCorpusId(corpusID);
					if (nbWordAnnos != -1)
						termino.setNbWordAnnotations(new AtomicLong(nbWordAnnos));
					if (nbSpottedTerms != -1)
						termino.setNbSpottedTerms(new AtomicLong(nbSpottedTerms));

					indexedCorpus = new IndexedCorpus(termino, occurrenceStore);
					if (options.isMetadataOnly())
						return indexedCorpus;

				} else if (TERM_WORDS.equals(fieldname)) {
					// skip the START_ARRAY of the words section
					jp.nextToken();
					while ((tok = jp.nextToken()) != JsonToken.END_ARRAY) {
						WordBuilder wordBuilder = WordBuilder.start(termino);
						while ((tok = jp.nextToken()) != JsonToken.END_OBJECT) {
							fieldname = jp.getCurrentName();
							if (LEMMA.equals(fieldname))
								wordBuilder.setLemma(jp.nextTextValue());
							else if (COMPOUND_TYPE.equals(fieldname))
								wordBuilder.setCompoundType(CompoundType.fromName(jp.nextTextValue()));
							else if (STEM.equals(fieldname))
								wordBuilder.setStem(jp.nextTextValue());
							else if (COMPONENTS.equals(fieldname)) {
								/*
								 * No explicit skip of START_ARRAY here: on the first
								 * iteration the outer loop consumes START_ARRAY and
								 * the inner loop consumes the START_OBJECT of the
								 * first component.
								 */
								while ((tok = jp.nextToken()) != JsonToken.END_ARRAY) {
									while ((tok = jp.nextToken()) != JsonToken.END_OBJECT) {
										fieldname = jp.getCurrentName();
										if (LEMMA.equals(fieldname))
											compLemma = jp.nextTextValue();
										else if (SUBSTRING.equals(fieldname))
											substring = jp.nextTextValue();
										else if (BEGIN.equals(fieldname))
											begin = jp.nextIntValue(-2);
										else if (COMPOUND_NEOCLASSICAL_AFFIX.equals(fieldname))
											neoclassicalAffix = jp.nextBooleanValue();
										else if (END.equals(fieldname))
											end = jp.nextIntValue(-2);
									}
									wordBuilder.addComponent(begin, end, substring, compLemma, neoclassicalAffix);
								}
							}
						}
						Word word = wordBuilder.create();
						termino.getWords().put(word.getLemma(), word);
					}
				} else if (TERMS.equals(fieldname)) {
					// skip the START_ARRAY of the terms section
					jp.nextToken();
					while ((tok = jp.nextToken()) != JsonToken.END_ARRAY) {
						TermBuilder builder = TermBuilder.start(termino);
						List<TempVecEntry> currentContextVector = Lists.newArrayList();
						Map<TermProperty, Object> properties = null;
						String currentGroupingKey = null;
						while ((tok = jp.nextToken()) != JsonToken.END_OBJECT) {
							fieldname = jp.getCurrentName();
							if (PROPERTIES.equals(fieldname)) {
								properties = readProperties(TermProperty.class, jp);
								Preconditions.checkState(properties.containsKey(TermProperty.GROUPING_KEY), MSG_NO_GROUPING_KEY_SET);
								currentGroupingKey = (String)properties.get(TermProperty.GROUPING_KEY);
							} else if (TERM_WORDS.equals(fieldname)) {
								// same token layout as for word components above
								while ((tok = jp.nextToken()) != JsonToken.END_ARRAY) {
									String wordLemma = null;
									String syntacticLabel = null;
									boolean isSWT = false;
									while ((tok = jp.nextToken()) != JsonToken.END_OBJECT) {
										fieldname = jp.getCurrentName();
										if (LEMMA.equals(fieldname))
											wordLemma = jp.nextTextValue();
										else if (IS_SWT.equals(fieldname))
											isSWT = jp.nextBooleanValue();
										else if (SYN.equals(fieldname))
											syntacticLabel = jp.nextTextValue();
									}
									Preconditions.checkState(wordLemma != null, MSG_EXPECT_PROP_FOR_TERM_WORD, LEMMA);
									Preconditions.checkState(termino.getWords().containsKey(wordLemma), MSG_WORD_NOT_FOUND, wordLemma);
									Preconditions.checkState(syntacticLabel != null, MSG_EXPECT_PROP_FOR_TERM_WORD, SYN);
									builder.addWord(termino.getWords().get(wordLemma), syntacticLabel, isSWT);
								}
							} else if (TERM_CONTEXT.equals(fieldname)) {
								@SuppressWarnings("unused")
								int totalCooccs = 0;
								while ((tok = jp.nextToken()) != JsonToken.END_OBJECT) {
									fieldname = jp.getCurrentName();
									if (TOTAL_COOCCURRENCES.equals(fieldname))
										/*
										 * value never used since the total will 
										 * be reincremented in the contextVector
										 */
										totalCooccs = jp.nextIntValue(-1);
									else if (CO_OCCURRENCES.equals(fieldname)) {
										// skip the START_ARRAY of the co-occurrences
										jp.nextToken();
										while ((tok = jp.nextToken()) != JsonToken.END_ARRAY) {
											TempVecEntry entry = new TempVecEntry();
											while ((tok = jp.nextToken()) != JsonToken.END_OBJECT) {
												fieldname = jp.getCurrentName();
												if (NB_COCCS.equals(fieldname))
													entry.setNbCooccs(jp.nextIntValue(-1));
												else if (ASSOC_RATE.equals(fieldname)) {
													jp.nextToken();
													entry.setAssocRate(jp.getFloatValue());
												} else if (CO_TERM.equals(fieldname))
													entry.setTermGroupingKey(jp.nextTextValue());
												else if (FILE.equals(fieldname)) {
													// read to advance the parser; the value is not used here
													fileSource = jp.nextIntValue(-1);
												}
											}
											currentContextVector.add(entry);
										}
									}
								}
							} else
								throw new IllegalStateException("Unexpected field name for term: " + fieldname);
							//end if fieldname

						} // end term object
						Preconditions.checkState(currentGroupingKey != null, MSG_NO_GKEY_FOR_TERM);
						Term t = builder.create();
						t.setProperties(properties);
						termino.getTerms().put(t.getGroupingKey(), t);

						if (options.isWithContexts())
							contextVectors.put(currentGroupingKey, currentContextVector);

					}// end array of terms

				} else if (INPUT_SOURCES.equals(fieldname)) {
					// skip the START_OBJECT of the input sources section
					jp.nextToken();
					while ((tok = jp.nextToken()) != JsonToken.END_OBJECT) {
						String id = jp.getCurrentName();
						try {
							inputSources.put(Integer.parseInt(id), jp.nextTextValue());
						} catch (NumberFormatException e) {
							// the parser is closed by the enclosing try-with-resources
							throw new IllegalArgumentException("Bad format for input source key: " + id, e);
						}
					}
				} else if (TERM_RELATIONS.equals(fieldname)) {
					// skip the START_ARRAY of the relations section
					jp.nextToken();
					while ((tok = jp.nextToken()) != JsonToken.END_ARRAY) {
						String base = null;
						String variant = null;
						String relationType = null;
						Map<RelationProperty, Object> properties = new HashMap<>();
						while ((tok = jp.nextToken()) != JsonToken.END_OBJECT) {
							fieldname = jp.getCurrentName();
							if (FROM.equals(fieldname))
								base = jp.nextTextValue();
							else if (TO.equals(fieldname))
								variant = jp.nextTextValue();
							else if (RELATION_TYPE.equals(fieldname))
								relationType = jp.nextTextValue();
							else if (PROPERTIES.equals(fieldname))
								properties = readProperties(RelationProperty.class, jp);
						}

						Preconditions.checkNotNull(base, MSG_EXPECT_PROP_FOR_VAR, FROM);
						Preconditions.checkNotNull(variant, MSG_EXPECT_PROP_FOR_VAR, TO);
						Term b = termino.getTerms().get(base);
						Term v = termino.getTerms().get(variant);
						if (b != null && v != null) {
							RelationType vType = RelationType.fromShortName(relationType);
							Relation tv = new Relation(vType, b, v);
							tv.setProperties(properties);
							termino.getRelations().add(tv);
						} else {
							// a relation to an unknown term is skipped, not fatal
							if (b == null)
								LOGGER.warn("Could not build variant because term \"{}\" was not found.", base);
							if (v == null)
								LOGGER.warn("Could not build variant because term \"{}\" was not found.", variant);
						}
					} // end syntactic variations array
				} else if (TERM_OCCURRENCES.equals(fieldname)) {
					tok = jp.nextToken();
					if (tok == JsonToken.START_ARRAY) {
						while ((tok = jp.nextToken()) != JsonToken.END_ARRAY) {
							String tid = null;
							String text = null;
							begin = -1;
							end = -1;
							fileSource = -1;
							while ((tok = jp.nextToken()) != JsonToken.END_OBJECT) {
								fieldname = jp.getCurrentName();
								if (BEGIN.equals(fieldname))
									begin = jp.nextIntValue(-1);
								else if (TEXT.equals(fieldname))
									text = jp.nextTextValue();
								else if (END.equals(fieldname))
									end = jp.nextIntValue(-1);
								else if (TERM_ID.equals(fieldname))
									tid = jp.nextTextValue();
								else if (FILE.equals(fieldname)) {
									fileSource = jp.nextIntValue(-1);
								}
							}

							Preconditions.checkArgument(begin != -1, MSG_EXPECT_PROP_FOR_OCCURRENCE, BEGIN);
							Preconditions.checkArgument(end != -1, MSG_EXPECT_PROP_FOR_OCCURRENCE, END);
							Preconditions.checkArgument(fileSource != -1, MSG_EXPECT_PROP_FOR_OCCURRENCE, FILE);
							String documentUrl = inputSources.get(fileSource);
							Preconditions.checkNotNull(documentUrl, MSG_NO_FILE_SOURCE_WITH_ID, fileSource);
							Preconditions.checkNotNull(text, MSG_EXPECT_PROP_FOR_OCCURRENCE, TEXT);
							Term term = termino.getTerms().get(tid);
							Preconditions.checkNotNull(term, MSG_NO_SUCH_TERM_IN_TERMINO, tid);
							indexedCorpus.getOccurrenceStore().addOccurrence(term, documentUrl, begin, end, text);
						}
					}
					// end occurrences

				}
			}
		}

		if (options.isWithContexts()) {
			/*
			 *  map term ids with terms in context vectors and
			 *  set context vectors
			 */
			for (Map.Entry<String, List<TempVecEntry>> contextEntry : contextVectors.entrySet()) {
				List<TempVecEntry> tempVecEntries = contextEntry.getValue();
				Term term = termino.getTerms().get(contextEntry.getKey());
				if (!tempVecEntries.isEmpty()) {
					ContextVector contextVector = new ContextVector(term);
					for (TempVecEntry tempVecEntry : tempVecEntries) {
						Term coTerm = termino.getTerms().get(tempVecEntry.getTermGroupingKey());
						contextVector.addEntry(coTerm, tempVecEntry.getNbCooccs(), tempVecEntry.getAssocRate());
					}
					term.setContext(contextVector);
				}
			}
		}

		return indexedCorpus;
	}

	@SuppressWarnings("unchecked")
	private static  & Property> Map readProperties(Class pCls, JsonParser jp) throws IOException {
		Map properties = new HashMap<>();
		T property;
		String fieldname;

		Preconditions.checkArgument(jp.nextToken() == JsonToken.START_OBJECT);
		while (jp.nextToken() != JsonToken.END_OBJECT) {
			fieldname = jp.getCurrentName();
			jp.nextToken();
			if(pCls.equals(RelationProperty.class))
				property = (T) RelationProperty.fromJsonString(fieldname);
			else if(pCls.equals(TermProperty.class))
				property = (T) TermProperty.fromJsonString(fieldname);
			else 
				throw new UnsupportedOperationException("Unsupported property class: " + pCls);
			properties.put(property, readPropertyValue(jp, property));
		}
		return properties;
	}

	private static  & Property> Object readPropertyValue(JsonParser jp, 
			T property) throws IOException {
		if(property.getRange().equals(Double.class)) {
			checkToken(property, jp.currentToken(), JsonToken.VALUE_NUMBER_FLOAT);
			return jp.getDoubleValue();
		} else if(property.getRange().equals(Float.class)) {
			checkToken(property, jp.currentToken(), JsonToken.VALUE_NUMBER_FLOAT);
			return (float)jp.getDoubleValue();
		} else if(property.getRange().equals(Integer.class)) {
			checkToken(property, jp.currentToken(), JsonToken.VALUE_NUMBER_INT);
			return jp.getIntValue();
		} else if(property.getRange().equals(Long.class)) {
			checkToken(property, jp.currentToken(), JsonToken.VALUE_NUMBER_INT);
			return jp.getLongValue();
		} else if(property.getRange().equals(Boolean.class)) {
			checkToken(property, jp.currentToken(), JsonToken.VALUE_FALSE, JsonToken.VALUE_TRUE);
			return jp.getBooleanValue();
		} else if(property.getRange().equals(String.class)) {
			checkToken(property, jp.currentToken(), JsonToken.VALUE_STRING);
			return jp.getValueAsString();
		} else if(Set.class.isAssignableFrom(property.getRange())) {
			checkToken(property, jp.currentToken(), JsonToken.START_ARRAY);
			HashSet