All downloads are free. The search and download functionality uses the official Maven repository.

opennlp.tools.dictionary.serializer.DictionaryEntryPersistor Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package opennlp.tools.dictionary.serializer;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.XMLReaderFactory;

import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.StringList;
import opennlp.tools.util.model.UncloseableInputStream;

/**
 * Reads and writes dictionaries in their XML serialization format.
 *
 * <p>A dictionary is written as a root {@code dictionary} element carrying a
 * {@code case_sensitive} attribute. It contains one {@code entry} element per
 * {@link Entry}, and each entry contains one {@code token} element per token.
 * Attributes of an {@link Entry} are stored as XML attributes of its
 * {@code entry} element.
 */
public class DictionaryEntryPersistor {

  // TODO: should check for invalid format, make it safe
  /**
   * SAX handler which turns the {@code entry} elements of a dictionary
   * document into {@link Entry} objects and forwards them to an
   * {@link EntryInserter}.
   *
   * <p>Elements are recognized by their local name only; the nesting of the
   * elements is not verified.
   */
  private static class DictionaryContenthandler implements ContentHandler {

    private final EntryInserter mInserter;

    private boolean mIsInsideTokenElement;

    // Stays true unless the dictionary element declares otherwise.
    private boolean mIsCaseSensitiveDictionary;

    // Tokens collected for the entry which is currently being read.
    private final List<String> mTokenList = new LinkedList<>();

    // Text collected for the token which is currently being read.
    private final StringBuilder token = new StringBuilder();

    // Attributes of the entry which is currently being read.
    private Attributes mAttributes;

    private DictionaryContenthandler(EntryInserter inserter) {
      mInserter = inserter;
      mIsCaseSensitiveDictionary = true;
    }

    /**
     * Copies all SAX attributes, keyed by their local name, into a
     * dictionary {@link Attributes} object.
     */
    private static Attributes copyAttributes(org.xml.sax.Attributes atts) {
      Attributes copy = new Attributes();

      for (int i = 0; i < atts.getLength(); i++) {
        copy.setValue(atts.getLocalName(i), atts.getValue(i));
      }

      return copy;
    }

    /**
     * Not implemented.
     */
    @Override
    public void processingInstruction(String target, String data)
        throws SAXException {
    }

    /**
     * Not implemented.
     */
    @Override
    public void startDocument() throws SAXException {
    }

    /**
     * Reads the case sensitivity flag from a {@code dictionary} element,
     * captures the attributes of an {@code entry} element and starts
     * collecting text when a {@code token} element begins.
     */
    @Override
    public void startElement(String uri, String localName, String qName,
        org.xml.sax.Attributes atts) throws SAXException {
      if (DICTIONARY_ELEMENT.equals(localName)) {
        String caseSensitive =
            copyAttributes(atts).getValue(ATTRIBUTE_CASE_SENSITIVE);

        // A missing attribute leaves the default (case sensitive) untouched.
        if (caseSensitive != null) {
          mIsCaseSensitiveDictionary = Boolean.parseBoolean(caseSensitive);
        }
      }
      else if (ENTRY_ELEMENT.equals(localName)) {
        mAttributes = copyAttributes(atts);
      }
      else if (TOKEN_ELEMENT.equals(localName)) {
        mIsInsideTokenElement = true;
      }
    }

    /**
     * Appends character data to the current token. The parser may report
     * the text of one element in several chunks, therefore the text is
     * accumulated until the {@code token} element ends.
     */
    @Override
    public void characters(char[] ch, int start, int length)
        throws SAXException {
      if (mIsInsideTokenElement) {
        token.append(ch, start, length);
      }
    }

    /**
     * Completes the current token when a {@code token} element ends, and
     * builds an {@link Entry} from the collected tokens and attributes and
     * passes it to the {@link EntryInserter} when an {@code entry} element
     * ends.
     *
     * @throws SAXException if the inserter rejects the entry with an
     *     {@link InvalidFormatException}
     */
    @Override
    public void endElement(String uri, String localName, String qName)
        throws SAXException {

      if (TOKEN_ELEMENT.equals(localName)) {
        mTokenList.add(token.toString().trim());
        token.setLength(0);
        mIsInsideTokenElement = false;
      }
      else if (ENTRY_ELEMENT.equals(localName)) {

        String[] tokens = mTokenList.toArray(new String[0]);

        Entry entry = new Entry(new StringList(tokens), mAttributes);

        try {
          mInserter.insert(entry);
        } catch (InvalidFormatException e) {
          throw new SAXException("Invalid dictionary format!", e);
        }

        // Reset the per-entry state for the next entry.
        mTokenList.clear();
        mAttributes = null;
      }
    }

    /**
     * Not implemented.
     */
    @Override
    public void endDocument() throws SAXException {
    }

    /**
     * Not implemented.
     */
    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    /**
     * Not implemented.
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
        throws SAXException {
    }

    /**
     * Not implemented.
     */
    @Override
    public void setDocumentLocator(Locator locator) {
    }

    /**
     * Not implemented.
     */
    @Override
    public void skippedEntity(String name) throws SAXException {
    }

    /**
     * Not implemented.
     */
    @Override
    public void startPrefixMapping(String prefix, String uri)
        throws SAXException {
    }
  }

  private static final String CHARSET = "UTF-8";

  private static final String DICTIONARY_ELEMENT = "dictionary";
  private static final String ENTRY_ELEMENT = "entry";
  private static final String TOKEN_ELEMENT = "token";
  private static final String ATTRIBUTE_CASE_SENSITIVE = "case_sensitive";

  // Standard SAX2 feature flags, switched off in create(...) so that a
  // dictionary document cannot pull in external entities (XXE).
  private static final String FEATURE_EXTERNAL_GENERAL_ENTITIES =
      "http://xml.org/sax/features/external-general-entities";
  private static final String FEATURE_EXTERNAL_PARAMETER_ENTITIES =
      "http://xml.org/sax/features/external-parameter-entities";


  /**
   * Creates {@link Entry}s from the given {@link InputStream} and
   * forwards these {@link Entry}s to the {@link EntryInserter}.
   *
   * <p>The stream is handed to the parser wrapped in an
   * {@link UncloseableInputStream}. NOTE(review): assuming that wrapper
   * suppresses {@code close()} as its name suggests, the provided stream
   * is left open and the caller stays responsible for closing it - confirm.
   *
   * <p>Resolution of external general and parameter entities is disabled
   * on the parser, so references to them in the document are not expanded.
   *
   * @param in stream to read entries from
   * @param inserter inserter to forward entries to
   *
   * @return the value of the {@code case_sensitive} attribute of the
   *     dictionary element, or {@code true} if the attribute is absent
   *
   * @throws IOException if an I/O error occurs while reading the stream
   * @throws InvalidFormatException if the XML reader cannot be created or
   *     configured, or the stream cannot be parsed as a dictionary
   */
  public static boolean create(InputStream in, EntryInserter inserter)
      throws IOException {

    DictionaryContenthandler dictionaryContentHandler =
        new DictionaryContenthandler(inserter);

    XMLReader xmlReader;
    try {
      xmlReader = XMLReaderFactory.createXMLReader();

      // The dictionary may come from an untrusted source, never resolve
      // external entities or an external DTD subset while reading it.
      xmlReader.setFeature(FEATURE_EXTERNAL_GENERAL_ENTITIES, false);
      xmlReader.setFeature(FEATURE_EXTERNAL_PARAMETER_ENTITIES, false);

      xmlReader.setContentHandler(dictionaryContentHandler);
      xmlReader.parse(new InputSource(new UncloseableInputStream(in)));
    }
    catch (SAXException e) {
      throw new InvalidFormatException("The profile data stream has " +
          "an invalid format!", e);
    }
    return dictionaryContentHandler.mIsCaseSensitiveDictionary;
  }

  /**
   * Serializes the given entries to the given {@link OutputStream} as a
   * case sensitive dictionary.
   *
   * After the serialization is finished the provided
   * {@link OutputStream} remains open.
   *
   * @param out stream to serialize to
   * @param entries entries to serialize
   *
   * @throws IOException If an I/O error occurs
   * @deprecated Use
   *     {@link DictionaryEntryPersistor#serialize(java.io.OutputStream, java.util.Iterator, boolean)} instead
   */
  @Deprecated
  public static void serialize(OutputStream out, Iterator<Entry> entries)
      throws IOException {
    DictionaryEntryPersistor.serialize(out, entries, true);
  }

  /**
   * Serializes the given entries to the given {@link OutputStream}.
   *
   * The document is written UTF-8 encoded and indented.
   *
   * After the serialization is finished the provided
   * {@link OutputStream} remains open.
   *
   * @param out stream to serialize to
   * @param entries entries to serialize
   * @param casesensitive indicates if the written dictionary
   *        should be case sensitive or case insensitive.
   *
   * @throws IOException If an I/O error occurs
   */
  public static void serialize(OutputStream out, Iterator<Entry> entries,
      boolean casesensitive) throws IOException {
    StreamResult streamResult = new StreamResult(out);
    SAXTransformerFactory tf = (SAXTransformerFactory)
        SAXTransformerFactory.newInstance();

    TransformerHandler hd;
    try {
      hd = tf.newTransformerHandler();
    } catch (TransformerConfigurationException e) {
      // Keep the cause, otherwise the actual configuration problem is lost.
      throw new AssertionError("The Transformer configuration must be valid!", e);
    }

    Transformer serializer = hd.getTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, CHARSET);
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");

    hd.setResult(streamResult);

    try {
      hd.startDocument();

      AttributesImpl dictionaryAttributes = new AttributesImpl();

      dictionaryAttributes.addAttribute("", "", ATTRIBUTE_CASE_SENSITIVE,
          "", String.valueOf(casesensitive));
      hd.startElement("", "", DICTIONARY_ELEMENT, dictionaryAttributes);

      while (entries.hasNext()) {
        serializeEntry(hd, entries.next());
      }

      hd.endElement("", "", DICTIONARY_ELEMENT);

      hd.endDocument();
    }
    catch (SAXException e) {
      throw new IOException("Error during serialization: " + e.getMessage(), e);
    }
  }

  /**
   * Writes one {@code entry} element with the attributes of the given
   * {@link Entry} and one nested {@code token} element per token.
   *
   * @param hd handler which receives the SAX events
   * @param entry entry to write
   *
   * @throws SAXException if the handler fails to process an event
   */
  private static void serializeEntry(TransformerHandler hd, Entry entry)
      throws SAXException {

    AttributesImpl entryAttributes = new AttributesImpl();

    for (Iterator<String> it = entry.getAttributes().iterator(); it.hasNext();) {
      String key = it.next();

      entryAttributes.addAttribute("", "", key,
          "", entry.getAttributes().getValue(key));
    }

    hd.startElement("", "", ENTRY_ELEMENT, entryAttributes);

    StringList tokens = entry.getTokens();

    for (String token : tokens) {

      hd.startElement("", "", TOKEN_ELEMENT, new AttributesImpl());

      hd.characters(token.toCharArray(), 0, token.length());

      hd.endElement("", "", TOKEN_ELEMENT);
    }

    hd.endElement("", "", ENTRY_ELEMENT);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy