All Downloads are FREE. Search and download functionalities are using the official Maven repository.

opennlp.tools.formats.NameFinderCensus90NameStream Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.formats;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Locale;

import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.StringList;
import opennlp.tools.util.StringUtil;

/**
 * This class helps to read the US Census data from the files to build a
 * StringList for each dictionary entry in the name-finder dictionary.
 * The entries in the source file are as follows:
 * 

* SMITH 1.006 1.006 1 *

    *
  • The first field is the name (in ALL CAPS). *
  • The next field is a frequency in percent. *
  • The next is a cumulative frequency in percent. *
  • The last is a ranking. *
*

* Note: Do not use this class, internal use only! */ public class NameFinderCensus90NameStream implements ObjectStream { private final Locale locale; private final Charset encoding; private final ObjectStream lineStream; /** * This constructor takes an ObjectStream and initializes the class to handle * the stream. * * @param lineStream an ObjectSteam<String> that represents the * input file to be attached to this class. */ public NameFinderCensus90NameStream(ObjectStream lineStream) { this.locale = new Locale("en"); // locale is English this.encoding = Charset.defaultCharset(); // todo how do we find the encoding for an already open ObjectStream() ? this.lineStream = lineStream; } /** * This constructor takes an InputStream and a Charset * and opens an associated stream object with the specified encoding specified. * * @param in an InputStreamFactory for the input file. * @param encoding the Charset to apply to the input stream. * @throws IOException */ public NameFinderCensus90NameStream(InputStreamFactory in, Charset encoding) throws IOException { this.locale = new Locale("en"); // locale is English this.encoding = encoding; this.lineStream = new PlainTextByLineStream(in, this.encoding); } public StringList read() throws IOException { String line = lineStream.read(); StringList name = null; if ((line != null) && (!StringUtil.isEmpty(line))) { String name2; // find the location of the name separator in the line of data. int pos = line.indexOf(' '); if ((pos != -1)) { String parsed = line.substring(0, pos); // the data is in ALL CAPS ... so the easiest way is to convert // back to standard mixed case. if ((parsed.length() > 2) && (parsed.startsWith("MC"))) { name2 = parsed.substring(0,1).toUpperCase(locale) + parsed.substring(1,2).toLowerCase(locale) + parsed.substring(2,3).toUpperCase(locale) + parsed.substring(3).toLowerCase(locale); } else { name2 = parsed.substring(0,1).toUpperCase(locale) + parsed.substring(1).toLowerCase(locale); } name = new StringList(new String[]{name2}); } } return name; } public void reset() throws IOException, UnsupportedOperationException { lineStream.reset(); } public void close() throws IOException { lineStream.close(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy