All downloads are free. The search and download functionality uses the official Maven repository.

com.ontotext.gate.gazetteer.HashGazetteer Maven / Gradle / Ivy

Go to download

ANNIE is a general purpose information extraction system that provides the building blocks of many other GATE applications.

There is a newer version: 9.1
Show newest version
/**
 * (c) Copyright Ontotext Lab, Sirma Group Corp 2004
 */
package com.ontotext.gate.gazetteer;

import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.gazetteer.AbstractGazetteer;
import gate.creole.gazetteer.GazetteerException;
import gate.creole.gazetteer.GazetteerList;
import gate.creole.gazetteer.GazetteerNode;
import gate.creole.gazetteer.LinearDefinition;
import gate.creole.gazetteer.LinearNode;
import gate.creole.gazetteer.Lookup;
import gate.creole.gazetteer.MappingNode;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.HiddenCreoleParameter;
import gate.creole.metadata.Sharable;
import gate.util.InvalidOffsetException;
import gate.util.GateRuntimeException;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

@CreoleResource(name = "Hash Gazetteer", icon = "gazetteer", comment = "A list lookup component implemented by OntoText Lab. The licence information is also available in licence.ontotext.html in the lib folder of GATE", helpURL = "http://www.ontotext.com/downloads/index.html#gazetteer")
public class HashGazetteer extends AbstractGazetteer {
  private static final long serialVersionUID = -4603155688378104052L;

  private ArrayList categoryList;

  private Map listsByNode;

  private Map> mapsList[];

  private AnnotationSet annotationSet = null;

  @Override
  @SuppressWarnings({"unchecked", "rawtypes"})
  public Resource init() throws ResourceInstantiationException {
    if(mapsList != null) {
      // this is a duplicate - nothing to do
    } else {
      if(listsURL == null)
        throw new ResourceInstantiationException(
            "No URL provided for gazetteer creation!");
      try {
        mapsList = new HashMap[1000];
        definition = new LinearDefinition();
        try {
          definition.setURL(listsURL.toURL());
        }
        catch (IOException e) {
          throw new ResourceInstantiationException(e);
        }        
        definition.load();
        int i = definition.size();
        listsByNode = definition.loadLists();
        categoryList = new ArrayList(i + 1);
        Iterator iterator = definition.iterator();
        int j = 0;
        LinearNode linearnode;
        for(; iterator.hasNext(); readList(linearnode)) {
          linearnode = iterator.next();
          fireStatusChanged("Reading " + linearnode.toString());
          fireProgressChanged((++j * 100) / i);
        }
        fireProcessFinished();
      }
      catch(GazetteerException exception) {
        throw new ResourceInstantiationException(exception);
      }
    }
    return this;
  }

  /**
   * Re-initialize this gazetteer by re-loading the configuration.
   */
  @Override
  public void reInit() throws ResourceInstantiationException {
    mapsList = null;
    categoryList = null;
    init();
  }

  @Override
  public void execute() throws ExecutionException {
    if(document == null) throw new ExecutionException("Document is null!");
    annotationSet = document.getAnnotations(annotationSetName);
    String s = document.getContent().toString() + " ";
    if(!super.caseSensitive.booleanValue()) {
      s = s.toUpperCase();
    }
    int documentLength = s.length();
    int j = 0;
    int k = 0;
    StringBuffer stringbuffer = new StringBuffer();
    boolean prevIsSymbol = false;
    boolean prevIsDigit = false;
    boolean prevIsLetter = false;
    // TODO what does this do, as it is only ever set to false
    boolean flag11 = false;
    String s3 = "";
    int i1 = 0;
    int j1 = 0;
    for(int position = 0; position < documentLength; position++) {
      char c = s.charAt(position);
      boolean currIsWhitespace = Character.isWhitespace(c);
      if(currIsWhitespace && stringbuffer.length() == 0) {
        j++;
        prevIsLetter = prevIsDigit = prevIsSymbol = flag11 = false;
        continue;
      }
      if(currIsWhitespace && prevIsSymbol && stringbuffer.length() == 1) {
        j += 2;
        prevIsLetter = prevIsDigit = prevIsSymbol = flag11 = false;
        stringbuffer.delete(0, stringbuffer.length());
        continue;
      }
      boolean currIsLetter = Character.isLetter(c);
      boolean currIsDigit = Character.isDigit(c);
      boolean currIsSymbol = !currIsWhitespace && !currIsLetter && !currIsDigit;
      boolean currIsLowerCase = Character.isLowerCase(c);
      if(k <= j
          && (currIsWhitespace || currIsSymbol || flag11 && !currIsLowerCase || !prevIsLetter
              && currIsLetter)) k = position;
      boolean flag13 =
          prevIsLetter && (currIsDigit || currIsSymbol || currIsWhitespace)
              || prevIsLetter && currIsLetter && flag11 && !currIsLowerCase
              || prevIsDigit
              && (currIsLetter || currIsSymbol || currIsWhitespace)
              || prevIsSymbol;
      if(position == documentLength - 1) flag13 = true;
      if(flag13) {
        boolean flag16 = !currIsSymbol && !currIsDigit;
        if(position == documentLength - 1) flag16 = true;
        String word = normalizeWhitespace(stringbuffer.toString());
        int k1 = word.length();
        flag16 &= k1 - j1 > 1;
        j1 = k1;
        if(i1 != j || !word.equals(s3)) {
          int wordLength = word.length();
          if(wordLength > 0) {
            boolean flag14 = annotate(word, j, position, wordLength);
            if(flag14) {
              s3 = word;
              i1 = j;
            }
            if(!flag14 && flag16 || documentLength - 1 == position) {
              if(k <= j) k = position;
              j = k;
              position = k - 1;
              stringbuffer.delete(0, stringbuffer.length());
              continue;
            }
          }
        }
      }
      stringbuffer.append(c);
      prevIsDigit = currIsDigit;
      prevIsLetter = currIsLetter;
      prevIsSymbol = currIsSymbol;
    }
    fireProcessFinished();
    fireStatusChanged("Hash Gazetteer processing finished!");
  }

  @Override
  public boolean add(String word, Lookup lookup1) {
    if(!super.caseSensitive.booleanValue()) {
      word = word.toUpperCase();
    }
    String s2 = removeTrailingSymbols(word);
    if(!s2.equals(word)) add(s2, lookup1);
    String s3 = word + " ";
    List arraylist = null;
    int j = s3.length();
    boolean prevIsLetter = false;
    boolean prevIsDigit = false;
    boolean prevIsLowercase = false;
    String s4 = "";
    Map> hashmap = null;
    for(int k = 0; k < j; k++) {
      char c = s3.charAt(k);
      boolean currIsWhitespace = Character.isWhitespace(c);
      boolean currIsDigit = Character.isDigit(c);
      boolean currIsLetter = Character.isLetter(c);
      boolean currIsSymbol = !currIsWhitespace && !currIsDigit && !currIsLetter;
      boolean currIsLowercase = Character.isLowerCase(c);
      boolean flag18 =
          prevIsLetter && (currIsDigit || currIsSymbol || currIsWhitespace)
              || prevIsLetter && currIsLetter && prevIsLowercase
              && !currIsLowercase || prevIsDigit
              && (currIsLetter || currIsSymbol || currIsWhitespace);
      // if we are on the last character
      if(k + 1 == j) flag18 = true;
      if(flag18) {
        s4 = normalizeWhitespace(s3.substring(0, k));
        int i = s4.length();
        if(mapsList[i] == null) {
          hashmap = new HashMap>();
          mapsList[i] = hashmap;
        } else {
          hashmap = mapsList[i];
        }
        if(!hashmap.containsKey(s4)) hashmap.put(s4, null);
      }
      prevIsDigit = currIsDigit;
      prevIsLetter = currIsLetter;
      prevIsLowercase = currIsLowercase;
    }
    arraylist = hashmap.get(s4);
    if(null == arraylist) {
      arraylist = new ArrayList(1);
      arraylist.add(lookup1);
    } else if(!arraylist.contains(lookup1)) arraylist.add(lookup1);
    hashmap.put(s4, arraylist);
    return true;
  }

  @Override
  public Set lookup(String s) {
    Set set = null;
    String s1 = normalizeWhitespace(s);
    int i = s1.length();
    if(mapsList.length < i) return set;
    Map> hashmap = mapsList[i];
    if(hashmap == null) {
      return set;
    } else {
      Set hashset = new HashSet(hashmap.get(s1));
      return hashset;
    }
  }

  private boolean annotate(String word, int i, int documentPosition,
      int wordLength) {
    if(wordLength >= mapsList.length) return false;
    Map> hashmap = mapsList[wordLength];
    if(hashmap == null) return false;
    if(!hashmap.containsKey(word)) return false;
    List arraylist = hashmap.get(word);
    // TODO shouldn't this return false if arraylist is null?
    if(null != arraylist) {
      for(Iterator iterator = arraylist.iterator(); iterator.hasNext();) {
        Lookup lookup1 = iterator.next();
        FeatureMap featuremap = Factory.newFeatureMap();
        featuremap.put("majorType", lookup1.majorType);
        if(null != lookup1.oClass && null != lookup1.ontology) {
          featuremap.put("class", lookup1.oClass);
          featuremap.put("ontology", lookup1.ontology);
        }
        if(null != lookup1.minorType) {
          featuremap.put("minorType", lookup1.minorType);
          if(null != lookup1.languages)
            featuremap.put("language", lookup1.languages);
        }
        try {
          annotationSet.add(Long.valueOf(i), Long.valueOf(documentPosition), "Lookup",
              featuremap);
        } catch(InvalidOffsetException invalidoffsetexception) {
          throw new GateRuntimeException(invalidoffsetexception.toString());
        }
      }
    }
    return true;
  }

  /**
   * Removes a string from the gazetteer
   *
   * @param s
   *          the item to remove
   * @return true if the operation was successful
   */
  @Override
  public boolean remove(String s) {
    String s1 = a(s);
    int i = s1.length();
    if(i > mapsList.length) return false;
    Map> hashmap = mapsList[i];
    if(hashmap == null) return false;
    if(hashmap.containsKey(s1)) {
      hashmap.remove(s1);
      return true;
    }
    return false;
  }

  /**
   * Works backwards through the String parameter removing each character until
   * it encounters a letter, digit, or whitespace at which point it returns the
   * truncated string.
   *
   * @param s
   *          the String you wish to remove trailing symbols from
   * @return the truncated String that now ends in a letter, digit, or
   *         whitespace character
   */
  private String removeTrailingSymbols(String s) {
    for(int i = s.length() - 1; i >= 0; i--) {
      char c = s.charAt(i);
      if(!Character.isLetter(c) && !Character.isDigit(c)
          && !Character.isWhitespace(c))
        s = s.substring(0, i);
      else return s;
    }
    return s;
  }

  /**
   * Normalizes the whitespace within the String instance by replacing any
   * sequence of one or more whitespace characters with a single space. Not that
   * any leading/trailing whitespace is also removed.
   *
   * @param s
   *          the String to normalize
   * @return the normalized String
   */
  private String normalizeWhitespace(String s) {
    // this seems to be the same as String.replaceAll("\\s+", " ")
    StringBuffer stringbuffer = new StringBuffer();
    s = s.trim();
    char ac[] = s.toCharArray();
    int i = s.length();
    boolean prevWasWhitespace = false;
    for(int j = 0; j < i; j++) {
      char c = ac[j];
      boolean currIsWhitespace = Character.isWhitespace(c);
      if(currIsWhitespace && !prevWasWhitespace)
        stringbuffer.append(' ');
      else if(!currIsWhitespace) stringbuffer.append(c);
      prevWasWhitespace = currIsWhitespace;
    }
    return stringbuffer.toString();
  }

  private String a(String s) {
    StringBuffer stringbuffer = new StringBuffer();
    boolean allLettersUppercase = true;
    s = s.trim();
    char ac[] = s.toCharArray();
    int i = s.length();
    if(i <= 1) return s;
    char firstCharacter = ac[0];
    stringbuffer.append(firstCharacter);
    boolean flag2 = true;
    boolean prevIsLetter = Character.isLetter(firstCharacter);
    boolean prevNotLetterOrDigit = !Character.isLetterOrDigit(firstCharacter);
    boolean flag10 = true;
    char c2 = 'p';
    for(int j = 1; j < i; j++) {
      char currentCharacter = ac[j];
      boolean currNotLetterOrDigit =
          !Character.isLetterOrDigit(currentCharacter);
      boolean currIsWhitespace = Character.isWhitespace(currentCharacter);
      boolean currIsLetter = Character.isLetter(currentCharacter);
      boolean currIsDigit = Character.isDigit(currentCharacter);
      if(flag2) {
        if(prevNotLetterOrDigit && currIsWhitespace) continue;
        flag2 =
            prevIsLetter && currNotLetterOrDigit || prevNotLetterOrDigit
                && currIsLetter;
        if(currNotLetterOrDigit) {
          if(c2 == 'p') c2 = currentCharacter;
          flag2 = flag10 = c2 == currentCharacter;
        }
        if(j > 2 && !flag2 && stringbuffer.length() > 0) {
          char c3 = stringbuffer.charAt(stringbuffer.length() - 1);
          stringbuffer.deleteCharAt(stringbuffer.length() - 1);
          stringbuffer.append(Character.toLowerCase(c3));
        }
      }
      if(currIsLetter || currIsDigit) {
        if(currIsLetter)
          allLettersUppercase &= Character.isUpperCase(currentCharacter);
        if(!flag10) currentCharacter = Character.toLowerCase(currentCharacter);
        stringbuffer.append(currentCharacter);
      } else if(!flag2) flag10 = false;
      prevIsLetter = currIsLetter;
      prevNotLetterOrDigit = currNotLetterOrDigit;
    }
    String s1 = stringbuffer.toString();
    if(allLettersUppercase) s1 = s1.toUpperCase();
    return s1;
  }

  private void readList(LinearNode linearnode) throws GazetteerException {
    if(linearnode == null)
      throw new GazetteerException("LinearNode node is null");
    GazetteerList gazetteerlist = listsByNode.get(linearnode);
    if(gazetteerlist == null)
      throw new GazetteerException("gazetteer list not found by node");
    String s = linearnode.getList();
    String majorType = linearnode.getMajorType();
    String minorType = linearnode.getMinorType();
    String language = linearnode.getLanguage();
    Lookup lookup1 = new Lookup(s, majorType, minorType, language);
    if(mappingDefinition != null) {
      MappingNode mappingnode = mappingDefinition.getNodeByList(s);
      if(null != mappingnode) {
        lookup1.oClass = mappingnode.getClassID();
        lookup1.ontology = mappingnode.getOntologyID();
      }
    }
    lookup1.list = s;
    categoryList.add(lookup1);
    Iterator iterator = gazetteerlist.iterator();
    String normalisedWord = null;
    for(; iterator.hasNext(); add(normalisedWord, lookup1)) {
      String word = iterator.next().toString();
      int wordLength = word.length();
      for(int j = 0; j < wordLength; j++) {
        if(j + 1 != wordLength && !Character.isWhitespace(word.charAt(j)))
          continue;
        if(j + 1 == wordLength) j = wordLength;
        normalisedWord = word.substring(0, j).trim();
      }
    }
  }

  @Override
  @HiddenCreoleParameter
  public void setWholeWordsOnly(Boolean wholeWordsOnly) {
    super.setWholeWordsOnly(wholeWordsOnly);
  }

  @Override
  @HiddenCreoleParameter
  public void setLongestMatchOnly(Boolean longestMatchOnly) {
    super.setLongestMatchOnly(longestMatchOnly);
  }
  
  @Sharable
  public void setMapsList(Map> mapsList[]) {
    this.mapsList = mapsList;
  }

  public Map>[] getMapsList() {
    return mapsList;
  }

  @Sharable
  public void setCategoryList(ArrayList categoryList) {
    this.categoryList = categoryList;
  }

  public ArrayList getCategoryList() {
    return categoryList;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy