All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.coref.sim.GenderModel Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package opennlp.tools.coref.sim;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import opennlp.maxent.GIS;
import opennlp.maxent.io.SuffixSensitiveGISModelReader;
import opennlp.maxent.io.SuffixSensitiveGISModelWriter;
import opennlp.model.Event;
import opennlp.model.MaxentModel;
import opennlp.tools.coref.resolver.ResolverUtils;
import opennlp.tools.util.CollectionEventStream;
import opennlp.tools.util.HashList;

/**
 * Class which models the gender of a particular mentions and entities made up of mentions.
 */
public class GenderModel implements TestGenderModel, TrainSimilarityModel {

  private int maleIndex;
  private int femaleIndex;
  private int neuterIndex;

  private String modelName;
  private String modelExtension = ".bin.gz";
  private MaxentModel testModel;
  private List events;
  private boolean debugOn = true;

  private Set maleNames;
  private Set femaleNames;

  public static TestGenderModel testModel(String name) throws IOException {
    GenderModel gm = new GenderModel(name, false);
    return gm;
  }

  public static TrainSimilarityModel trainModel(String name) throws IOException {
    GenderModel gm = new GenderModel(name, true);
    return gm;
  }

  private Set readNames(String nameFile) throws IOException {
    Set names = new HashSet();
    BufferedReader nameReader = new BufferedReader(new FileReader(nameFile));
    for (String line = nameReader.readLine(); line != null; line = nameReader.readLine()) {
      names.add(line);
    }
    return names;
  }

  private GenderModel(String modelName, boolean train) throws IOException {
    this.modelName = modelName;
    maleNames = readNames(modelName+".mas");
    femaleNames = readNames(modelName+".fem");
    if (train) {
      events = new ArrayList();
    }
    else {
      //if (MaxentResolver.loadAsResource()) {
      //  testModel = (new BinaryGISModelReader(new DataInputStream(this.getClass().getResourceAsStream(modelName)))).getModel();
      //}
      testModel = (new SuffixSensitiveGISModelReader(new File(modelName+modelExtension))).getModel();
      maleIndex = testModel.getIndex(GenderEnum.MALE.toString());
      femaleIndex = testModel.getIndex(GenderEnum.FEMALE.toString());
      neuterIndex = testModel.getIndex(GenderEnum.NEUTER.toString());
    }
  }

  private List getFeatures(Context np1) {
    List features = new ArrayList();
    features.add("default");
    for (int ti = 0, tl = np1.getHeadTokenIndex(); ti < tl; ti++) {
      features.add("mw=" + np1.getTokens()[ti].toString());
    }
    features.add("hw=" + np1.getHeadTokenText());
    features.add("n="+np1.getNameType());
    if (np1.getNameType() != null && np1.getNameType().equals("person")) {
      Object[] tokens = np1.getTokens();
      //System.err.println("GenderModel.getFeatures: person name="+np1);
      for (int ti=0;ti entity) {
    for (Iterator ci = entity.iterator(); ci.hasNext();) {
      Context ec = ci.next();
      GenderEnum ge = getGender(ec);
      if (ge != GenderEnum.UNKNOWN) {
        return ge;
      }
    }

    return GenderEnum.UNKNOWN;
  }

  @SuppressWarnings("unchecked")
  public void setExtents(Context[] extentContexts) {
    HashList entities = new HashList();
    List singletons = new ArrayList();
    for (int ei = 0, el = extentContexts.length; ei < el; ei++) {
      Context ec = extentContexts[ei];
      //System.err.println("GenderModel.setExtents: ec("+ec.getId()+") "+ec.toText());
      if (ec.getId() != -1) {
        entities.put(ec.getId(), ec);
      }
      else {
        singletons.add(ec);
      }
    }
    List males = new ArrayList();
    List females = new ArrayList();
    List eunuches = new ArrayList();
    //coref entities
    for (Iterator ei = entities.keySet().iterator(); ei.hasNext();) {
      Integer key = ei.next();
      List entityContexts = (List) entities.get(key);
      GenderEnum gender = getGender(entityContexts);
      if (gender != null) {
        if (gender == GenderEnum.MALE) {
          males.addAll(entityContexts);
        }
        else if (gender == GenderEnum.FEMALE) {
          females.addAll(entityContexts);
        }
        else if (gender == GenderEnum.NEUTER) {
          eunuches.addAll(entityContexts);
        }
      }
    }
    //non-coref entities
    for (Iterator ei = singletons.iterator(); ei.hasNext();) {
      Context ec = ei.next();
      GenderEnum gender = getGender(ec);
      if (gender == GenderEnum.MALE) {
        males.add(ec);
      }
      else if (gender == GenderEnum.FEMALE) {
        females.add(ec);
      }
      else if (gender == GenderEnum.NEUTER) {
        eunuches.add(ec);
      }
    }
    for (Iterator mi = males.iterator(); mi.hasNext();) {
      Context ec = mi.next();
      addEvent(GenderEnum.MALE.toString(), ec);
    }
    for (Iterator fi = females.iterator(); fi.hasNext();) {
      Context ec = fi.next();
      addEvent(GenderEnum.FEMALE.toString(), ec);
    }
    for (Iterator ei = eunuches.iterator(); ei.hasNext();) {
      Context ec = ei.next();
      addEvent(GenderEnum.NEUTER.toString(), ec);
    }
  }

  public static void main(String[] args) throws IOException {
    if (args.length == 0) {
      System.err.println("Usage: GenderModel modelName < tiger/NN bear/NN");
      System.exit(1);
    }
    String modelName = args[0];
    GenderModel model = new GenderModel(modelName, false);
    //Context.wn = new WordNet(System.getProperty("WNHOME"), true);
    //Context.morphy = new Morphy(Context.wn);
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    for (String line = in.readLine(); line != null; line = in.readLine()) {
      String[] words = line.split(" ");
      double[] dist = model.genderDistribution(Context.parseContext(words[0]));
      System.out.println("m="+dist[model.getMaleIndex()] + " f=" +dist[model.getFemaleIndex()]+" n="+dist[model.getNeuterIndex()]+" "+model.getFeatures(Context.parseContext(words[0])));
    }
  }

  public double[] genderDistribution(Context np1) {
    List features = getFeatures(np1);
    if (debugOn) {
      //System.err.println("GenderModel.genderDistribution: "+features);
    }
    return testModel.eval(features.toArray(new String[features.size()]));
  }

  public void trainModel() throws IOException {
    if (debugOn) {
      FileWriter writer = new FileWriter(modelName+".events");
      for (Iterator ei=events.iterator();ei.hasNext();) {
        Event e = ei.next();
        writer.write(e.toString()+"\n");
      }
      writer.close();
    }
    new SuffixSensitiveGISModelWriter(
        GIS.trainModel(
        new CollectionEventStream(events), true),
        new File(modelName+modelExtension)).persist();
  }

  public int getFemaleIndex() {
    return femaleIndex;
  }

  public int getMaleIndex() {
    return maleIndex;
  }

  public int getNeuterIndex() {
    return neuterIndex;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy