All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.ontotext.kim.model.AliasCacheImpl Maven / Gradle / Ivy

The newest version!
package com.ontotext.kim.model;

import gate.creole.ResourceInstantiationException;
import gate.util.profile.Profiler;
import gnu.trove.TIntHashSet;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.rmi.RemoteException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.collections.Transformer;
import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger;
import org.openrdf.model.URI;
import org.openrdf.model.impl.URIImpl;

import com.ontotext.kim.KIMConstants;
import com.ontotext.kim.client.KIMRuntimeException;
import com.ontotext.kim.client.query.KIMQueryException;
import com.ontotext.kim.client.semanticrepository.QueryResultListener;
import com.ontotext.kim.client.semanticrepository.QueryResultListener.Feed;
import com.ontotext.kim.gate.KimLookupParser;
import com.ontotext.kim.gate.KimLookupParser.AliasLookupDictionary;
import com.ontotext.kim.util.StringTransformations;


/**
 * This class is designed to serve as an Alias dictionary for the
 * KimGazetteer. It is used to store an image of the textual
 * representations (Aliases) of the known objects (Entities) described in the
 * KIM platform knowledge-base. Then it allows to check text fragments for
 * the presence of known Aliases.
* The aliases are not stored as plain text but as couple of hash codes. * The storage is implemented by the couple of classes HashedAlias * and HashRegister. Because of this specific - before storing * of an Alias and also before checking a text fragment - they both must * be pre-processed. The pre-processing is implemented in couple of classes. * The class AliasTextTransformer does preliminary text * normalization and the class ParsingFrame does further * normalization and hash-codes calculation.
*
* An instance of the AliasCacheImpl class is obtained through * a synchronized factory method. * * @author danko * */ public class AliasCacheImpl implements AliasLookupDictionary { protected static Logger log = Logger.getLogger(AliasCacheImpl.class); private static DataFeedFactory feedFactory = new DataFeedFactory(); /** The register containing the HashedAlias instances */ protected HashRegister aliasRegister; /** The set of hash-codes of valid alias prefixes. This set is used in * text parsing and lookup phase. It helps to determine if an attempt * must be made to expand the span of the ParsingFrame used * to parse the searched text. An expansion is made only if currently * framed text generates Alias-Hash-1 code which appears in the * aliasPrefixes set. (For details see * ParsingFrame) */ protected TIntHashSet aliasPrefixes; /** Additional register which allows fast checking if a given Entity's * aliases has been stored in the alias register. */ protected HashRegister aliasInstRegister; /** Array used for encoding/decoding the instance URI's name-spaces */ protected List instNS; /** Array used for encoding/decoding the semantic class URIs */ protected List classCache; /** Additional register containing exactly appointed aliases, which * must be ignored on storing */ protected HashRegister aliasToIgnore; /** The general case sensitivity selector of the Alias cache */ private String caseSensitivity; protected AliasCacheImpl (String caseSensitive) { this.caseSensitivity = caseSensitive; } //========================================================================= // Alias Cache: Instance Generation section //========================================================================= private static Map aliasDictionaries = new HashMap(); private static Object instanceLock = new Object(); public static AliasCacheImpl getInstance() throws ResourceInstantiationException { return getInstance(new File(KIMConstants.KIM_CACHE_PATH), ""); } private static class LoadedCache { public AliasCacheImpl cache; public List clients = new LinkedList(); } /** * A static method for generation/access to the one and only instance * of the alias cache * @return the instance of the cache */ public static AliasCacheImpl getInstance(File dictionaryPath, String clientId) throws ResourceInstantiationException { synchronized(instanceLock) { if ( !aliasDictionaries.containsKey(dictionaryPath)) { LoadedCache lc = new LoadedCache(); lc.cache = createInstance(dictionaryPath); aliasDictionaries.put(dictionaryPath, lc); } } LoadedCache lc = aliasDictionaries.get(dictionaryPath); lc.clients.add(clientId); return lc.cache; } public static void releaseCache(File dictionaryPath, String clientId) { synchronized(instanceLock) { LoadedCache lc = aliasDictionaries.get(dictionaryPath); if (lc == null) return; lc.clients.remove(clientId); if (lc.clients.isEmpty()) aliasDictionaries.remove(dictionaryPath); else { log.info("The cache for " + dictionaryPath + " will not be unloaded or reloaded because some clients remain: " + lc.clients); } } } public static AliasCacheImpl createInstance(File dictionaryPath) throws ResourceInstantiationException { Options opt = Options.load(dictionaryPath); AliasCacheImpl aliasCacheInstance = new AliasCacheImpl(opt.getCaseSensitivity()); Feed feed = feedFactory.createFeed(dictionaryPath, opt); Set ignoreList = Collections.emptySet(); File ignoreListFile = opt.getIgnoreListPath(); if (ignoreListFile != null) { if (ignoreListFile.isFile()) { try { ignoreList = new HashSet(FileUtils.readLines(opt.getIgnoreListPath(), "UTF-8")); log.info(ignoreList.size() + " unique entries loaded from ignore list at " + ignoreListFile.getAbsolutePath()); } catch(IOException e) { log.warn("Could not read " + ignoreListFile.getAbsolutePath(), e); } } else { log.warn("Ignore list at " + ignoreListFile.getAbsolutePath() + " is not present or is not an accessible file."); } } try { aliasCacheInstance.initCache(ignoreList, feed, dictionaryPath, opt.isCacheEnabled()); } catch (RemoteException e) { throw new ResourceInstantiationException(e); } return aliasCacheInstance; } //========================================================================= // Alias Cache Persistence section //========================================================================= /** * Checks whether an alias can be added to the cache, honoring the * caseSensitivity setting. The ignore list check is performed here. * * @return whether the alias can be added */ private boolean verifyAlias(String alias) { if (alias==null || alias.trim().length() == 0) return false; alias = (String) ParsingFrame.frameTT.transform(alias); if (aliasToIgnore.exists(alias.hashCode(), alias)) { log.info("'" + alias + "' ignored, because it was found in the ignore list."); return false; } return true; } /** This class is used in deserialization process to initialize the * aliasInstRegister register */ private static class InstanceRegisterLoader implements HashRegister.ContentProcessor { HashRegister instRegister; public InstanceRegisterLoader(HashRegister register) { instRegister = register; } public void process(Object[] elements) { if (elements != null) { for (int i=0; i * @param ignoreAliases a String list of aliases to be ignored. */ protected void initBlankCache(Collection ignoreAliases) { aliasRegister = new HashRegister(); aliasPrefixes = new TIntHashSet(); aliasInstRegister = new HashRegister(); instNS = new ArrayList(); classCache = new ArrayList(); // Create a TextTransformer instance for Alias text normalization Transformer tt = new AliasTextTransformer( caseSensitivity.equals(Options.INSENSITIVE)); ParsingFrame.frameTT = tt; aliasToIgnore = new HashRegister(); if (ignoreAliases != null) { for (String alias : ignoreAliases) { // Apply same text normalization to the aliases to be ignored alias = (String)tt.transform(alias); aliasToIgnore.add(alias.hashCode(), alias); if (caseSensitivity.equals(Options.ALL_UPPER)) { alias = alias.toUpperCase(); aliasToIgnore.add(alias.hashCode(), alias); } } } log.info( "Aliases in IGNORE list:" + aliasToIgnore.getElementsCount()); } /** This method implements the default full initialization process. * It creates an empty Alias cache and then fills it with data. The data * is collected either from the semantic repository or from a serialization * source (a file). * @param ignoreAliases a String list of aliases to be ignored. * @throws RemoteException on failure to access the semantic repository. */ protected void initCache( Collection ignoreAliases, QueryResultListener.Feed dataFeed, File dictionaryPath, boolean fileCacheEnabled) throws RemoteException { Profiler pro = new Profiler(); pro.enableGCCalling(false); pro.printToSystemOut(true); pro.initRun("Loading of Entities Cache"); pro.checkPoint("start loading"); initBlankCache(ignoreAliases); File fileTCache = new File(dictionaryPath, "kim.trusted.entities.cache").getAbsoluteFile(); // The presence of this flag marks a cache file, which is invalid due to an interruption in loading. File flagTCache = new File(dictionaryPath, fileTCache.getName() + ".flag"); if (fileCacheEnabled) { try { ensureCachePath(dictionaryPath); } catch (IOException e1) { log.error( "Could not create entity cache.", e1); } } boolean flagTLoaded = false; if (fileCacheEnabled && fileTCache.exists() && !flagTCache.exists()) { log.info("Loading of trusted entities from " + fileTCache); flagTLoaded = loadDictionaryFromCacheFile(fileTCache, flagTLoaded); } if (!flagTLoaded) { loadTrustedMaps(dataFeed); if (fileCacheEnabled) { try { flagTCache.createNewFile(); if (fileTCache.exists()) fileTCache.delete(); ObjectOutputStream oos = new ObjectOutputStream( new FileOutputStream(fileTCache)); oos.writeObject( new Object[]{aliasRegister, aliasPrefixes, instNS, classCache}); oos.close(); flagTCache.delete(); } catch (Exception ex) { log.error("Saving of trusted entities to " + fileTCache + " failed.", ex); } } } log.info("Aliases were loaded"); pro.checkPoint("cache loaded"); } @SuppressWarnings("unchecked") private boolean loadDictionaryFromCacheFile(File fileTCache, boolean flagTLoaded) { try { ObjectInputStream ois = new ObjectInputStream( new FileInputStream(fileTCache)); Object[] res = (Object[]) ois.readObject(); ois.close(); aliasRegister = (HashRegister) res[0]; aliasPrefixes = (TIntHashSet) res[1]; instNS = (List) res[2]; classCache = (List) res[3]; aliasInstRegister = new HashRegister(); // The exactly same Entity InstURI strings are reused aliasRegister.processContent( new InstanceRegisterLoader(aliasInstRegister)); flagTLoaded = true; log.info(aliasRegister.getElementsCount() + " elements loaded."); } catch (Exception e) { log.error("Loading from " + fileTCache + " failed. " + "Continue with loading from Semantic Repository.", e); } return flagTLoaded; } private void ensureCachePath(File cachePath) throws IOException { if (cachePath.exists() && !cachePath.isDirectory()) FileUtils.forceDelete(cachePath); if (!cachePath.exists()) FileUtils.forceMkdir(cachePath); } private void loadTrustedMaps(QueryResultListener.Feed dataFeed) { log.info("Loading of trusted entities from Sesame"); String filePath = System.getProperty("kim.home.dir", ".") + EntityPriority.PRIORITY_CONF_FILE.substring(1); existsClassPriority = (new File(filePath)).exists(); if (existsClassPriority) { try { entPrior = new EntityPriority(); entPrior.init(); existsClassPriority = existsClassPriority && entPrior.getFilterLookups(); } catch (Exception e) { log.error( "Cannot create instance of Priorities class", e); entPrior = null; } } EntitiesQueryListener entityListener = new TrustedEntitiesListener(entPrior); // Handler to preserve the same inner listener for the two queries if ( log.isDebugEnabled() ) { entityListener = StatisticListener.wrap(entityListener, "Thrusted Entities"); } try { dataFeed.feedTo(entityListener); } catch (KIMQueryException e) { log.error("Loading failed.", e); throw new KIMRuntimeException("The loading failed.", e); } finally { log.info("The loading from Sesame finished"); } } /** A class extending the EntitiesQueryListener, which is * used to process the input from the semantic repository. It is used * only when the data is loaded from there. */ class TrustedEntitiesListener extends EntitiesQueryListener { private final EntityPriority m_entPrior; TrustedEntitiesListener(EntityPriority m_entPrior) { this.m_entPrior = m_entPrior; } @Override protected void addEntity(String instUri, String classUri, String aliasLabel) { addAlias(instUri, classUri, aliasLabel, true); } @Override public void endTableQueryResult() throws IOException { super.endTableQueryResult(); if (existsClassPriority && allPrioritiesCompetition != null) { Iterator it = allPrioritiesCompetition.keySet().iterator(); while (it.hasNext()) { List pcList = allPrioritiesCompetition.get(it.next()); int maxPrior = pcList.get(0).maxPriority; int treshold = m_entPrior.getThreshold(); for (int i = 0; i < pcList.size(); i++) { priorityCompetition pc = pcList.get(i); if (i == 0 || maxPrior - pc.maxPriority <= treshold) addAlias(pc.instURI, pc.classURI, pc.alias, false); } } allPrioritiesCompetition = null; } } } //========================================================================= // Alias Cache: Statistics Collection section //========================================================================= /** The class is used to collect timing data for profiling purposes */ public static class Stats { private static boolean doStats = false; private static final String[] statNames = { "AA_PrefixStore", "AA_URIStringReuse", "AA_InstUriRegisterInsert", "AA_AliasRegisterInsert", "AL_PreParsing", "AL_GetByAliasHash1", "AL_FilterByAliasHash2", "PF_MakeFrame", "PF_Find", "PF_RecalcFrame", "PF_MakeFrameSnapshot" }; private static long[] statTimes = new long[statNames.length]; public static void restartStats() { doStats = true; Arrays.fill(statTimes, 0); } public static void stopStats() { doStats = false; } public static boolean doStats() { return doStats; } private static long curr; private static long last; public static void markIt(int index) { if (doStats) { curr = System.currentTimeMillis(); long duration = curr-last; if (index >= 0 && index < statTimes.length) statTimes[index] += duration; last = curr; } } public static void dumpStats() { for (int i=0; i 0) System.out.println( " " + statNames[i] + " = " + statTimes[i] + "ms."); } } //========================================================================= // Alias Cache: Population section //========================================================================= /** Adds an Alias with its instance and semantic class to the Alias cache. * A single call to this method could result in adding several records to * the alias cache. This is as a result to the standard Alias enrichment * logic which is applied over the given as input alias string. * @param instURI the URI of the Entity instance corresponding to the * Alias * @param classURI the URI of the semantic class * @param alias the string of the alias * @param primaryAccess processing specific flag; if true - forces * class priority checks. */ public void addAlias(String instURI, String classURI, String alias, boolean primaryAccess) { if (checkClassPriority(instURI, classURI, alias, primaryAccess)) return; String[] enriched = aliasEnrichment(alias); for (int i=0; i< enriched.length; i++) { if (verifyAlias(enriched[i])) simpleAddAlias(instURI, classURI, enriched[i]); } } /** This method performs the standard alias enrichment. It covers * cases as variants with and without trailing punctuation. * @param alias the original alias string. * @return array of distinct strings which are accepted as equally valid * representation of the related to the Alias - Entity. */ private String[] aliasEnrichment(String alias) { HashSet aliases = new HashSet(); aliases.add(alias); String[] tmp; // Enrich with UPPER case versions if needed if (caseSensitivity.equals(Options.ALL_UPPER)) { tmp = aliases.toArray(new String[0]); for (int i=0; i lookup(String alias) { ParsingFrame pfm = new ParsingFrame(alias); pfm.parseAll(); return lookup(pfm, true); } public Collection lookup(ParsingFrame pfm) { return lookup(pfm, false); } private List lookup( ParsingFrame pfm, boolean exactlySame) { Stats.markIt(-1); List res = new ArrayList(); Object[] tmp = aliasRegister.get(pfm.getAliasHash1()); Stats.markIt(5); if (tmp == null || tmp.length==0) return res; for (int i=0; i> allPrioritiesCompetition = new HashMap>(); protected EntityPriority entPrior; protected boolean existsClassPriority = false; /** * If current label class is 'competitive', the label is put in a Map, * having List * for all concurrent classes descending sorted by weight * because of small number of classes for a label, direct insert is * chosen in comparison with binary */ private boolean checkClassPriority(String instURI, String classURI, final String alias, boolean primaryAccess) { boolean rejectedByPriority = false; if (primaryAccess && existsClassPriority) { URI origClass = new URIImpl(classURI); String priorityClassName = origClass.getLocalName(); rejectedByPriority = entPrior.m_hClassPrio.containsKey(priorityClassName); if (rejectedByPriority) { int mp = entPrior.m_hClassPrio.get(priorityClassName); log.info("COMPETITION:" + "\t" + instURI + "\t" + classURI + "\t" + alias + "\t" + mp); if (!allPrioritiesCompetition.containsKey(alias)) allPrioritiesCompetition.put(alias, new ArrayList()); List pcList = allPrioritiesCompetition.get(alias); boolean foundLesser = false; for (int i = 0; i < pcList.size(); i++) { if (mp > pcList.get(i).maxPriority) { pcList.add(i, new priorityCompetition( instURI, classURI, alias, mp)); foundLesser = true; break; } } if (!foundLesser) pcList.add( new priorityCompetition( instURI, classURI, alias, mp)); } } return rejectedByPriority; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy