All downloads are free. Search and download functionality uses the official Maven repository.

opennlp.tools.dictionary.serializer.DictionarySerializer Maven / Gradle / Ivy

There is a newer version: 2.5.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package opennlp.tools.dictionary.serializer;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;

import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.StringList;
import opennlp.tools.util.model.UncloseableInputStream;

import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * Reads and writes dictionaries in their XML representation.
 * <p>
 * A serialized dictionary consists of a {@code dictionary} root element that
 * contains {@code entry} elements. Each entry stores its attributes as XML
 * attributes and its tokens as nested {@code token} elements. The optional
 * {@code case_sensitive} attribute of the root element is reported back by
 * {@link #create(InputStream, EntryInserter)}.
 */
public class DictionarySerializer {

  private static final String CHARSET = "UTF-8";

  private static final String DICTIONARY_ELEMENT = "dictionary";
  private static final String ENTRY_ELEMENT = "entry";
  private static final String TOKEN_ELEMENT = "token";
  private static final String ATTRIBUTE_CASE_SENSITIVE = "case_sensitive";

  // Standard SAX2 feature names; both are switched off before parsing so that
  // a dictionary file cannot make the parser fetch external entities or an
  // external DTD subset (XXE).
  private static final String FEATURE_EXTERNAL_GENERAL_ENTITIES =
      "http://xml.org/sax/features/external-general-entities";
  private static final String FEATURE_EXTERNAL_PARAMETER_ENTITIES =
      "http://xml.org/sax/features/external-parameter-entities";

  /**
   * SAX handler which turns the {@code entry} elements of a dictionary
   * document into {@link Entry} objects and hands each one to the
   * {@link EntryInserter} as soon as its end tag is seen.
   */
  // TODO: should check for invalid format, make it safe
  private static class DictionaryContenthandler implements ContentHandler {

    private final EntryInserter mInserter;

    /** Tokens collected for the entry which is currently being read. */
    private final List<String> mTokenList = new LinkedList<>();

    /** Character data of the token element which is currently being read. */
    private final StringBuilder token = new StringBuilder();

    private boolean mIsInsideTokenElement;

    /** Defaults to {@code true} when the root element does not say otherwise. */
    private boolean mIsCaseSensitiveDictionary;

    /** Attributes of the entry which is currently being read. */
    private Attributes mAttributes;

    private DictionaryContenthandler(EntryInserter inserter) {
      mInserter = inserter;
      mIsCaseSensitiveDictionary = true;
    }

    /**
     * Copies the SAX attributes, keyed by their local name, into a
     * dictionary {@link Attributes} object.
     */
    private static Attributes copyAttributes(org.xml.sax.Attributes atts) {
      Attributes copy = new Attributes();

      for (int i = 0; i < atts.getLength(); i++) {
        copy.setValue(atts.getLocalName(i), atts.getValue(i));
      }

      return copy;
    }

    /**
     * Not implemented.
     */
    @Override
    public void processingInstruction(String target, String data)
        throws SAXException {
    }

    /**
     * Not implemented.
     */
    @Override
    public void startDocument() throws SAXException {
    }

    /**
     * Reads the case sensitivity flag from the {@code dictionary} element,
     * remembers the attributes of an {@code entry} element and starts
     * collecting character data for a {@code token} element.
     */
    @Override
    public void startElement(String uri, String localName, String qName,
        org.xml.sax.Attributes atts) throws SAXException {

      if (DICTIONARY_ELEMENT.equals(localName)) {
        String caseSensitive =
            copyAttributes(atts).getValue(ATTRIBUTE_CASE_SENSITIVE);

        // A missing attribute keeps the default chosen in the constructor.
        if (caseSensitive != null) {
          mIsCaseSensitiveDictionary = Boolean.parseBoolean(caseSensitive);
        }
      }
      else if (ENTRY_ELEMENT.equals(localName)) {
        mAttributes = copyAttributes(atts);
      }
      else if (TOKEN_ELEMENT.equals(localName)) {
        mIsInsideTokenElement = true;
      }
    }

    /**
     * Appends character data to the current token; text outside of a
     * {@code token} element is ignored.
     */
    @Override
    public void characters(char[] ch, int start, int length)
        throws SAXException {
      // The parser may deliver the text of one element in several chunks,
      // therefore the chunks are accumulated until the end tag is reached.
      if (mIsInsideTokenElement) {
        token.append(ch, start, length);
      }
    }

    /**
     * Completes the current token at the end of a {@code token} element and
     * builds and inserts the {@link Entry} at the end of an {@code entry}
     * element.
     *
     * @throws SAXException if the inserter rejects the entry
     */
    @Override
    public void endElement(String uri, String localName, String qName)
        throws SAXException {

      if (TOKEN_ELEMENT.equals(localName)) {
        mTokenList.add(token.toString().trim());
        token.setLength(0);
        mIsInsideTokenElement = false;
      }
      else if (ENTRY_ELEMENT.equals(localName)) {

        String[] tokens = mTokenList.toArray(new String[mTokenList.size()]);

        Entry entry = new Entry(new StringList(tokens), mAttributes);

        try {
          mInserter.insert(entry);
        } catch (InvalidFormatException e) {
          throw new SAXException("Invalid dictionary format!", e);
        }

        // Reset the per-entry state for the next entry element.
        mTokenList.clear();
        mAttributes = null;
      }
    }

    /**
     * Not implemented.
     */
    @Override
    public void endDocument() throws SAXException {
    }

    /**
     * Not implemented.
     */
    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
    }

    /**
     * Not implemented.
     */
    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
        throws SAXException {
    }

    /**
     * Not implemented.
     */
    @Override
    public void setDocumentLocator(Locator locator) {
    }

    /**
     * Not implemented.
     */
    @Override
    public void skippedEntity(String name) throws SAXException {
    }

    /**
     * Not implemented.
     */
    @Override
    public void startPrefixMapping(String prefix, String uri)
        throws SAXException {
    }
  }

  /**
   * Creates {@link Entry}s from the given {@link InputStream} and
   * forwards these {@link Entry}s to the {@link EntryInserter}.
   * <p>
   * The stream is wrapped in an {@link UncloseableInputStream} before it is
   * handed to the parser, so the parser presumably cannot close it; callers
   * should close the stream themselves.
   * <p>
   * Resolution of external general and parameter entities (including the
   * external DTD subset) is disabled on the parser.
   *
   * @param in stream to read entries from
   * @param inserter inserter to forward entries to
   *
   * @return isCaseSensitive attribute for Dictionary; {@code true} if the
   *         dictionary element does not specify it
   *
   * @throws InvalidFormatException if the stream content cannot be parsed or
   *         an entry is rejected by the inserter
   * @throws IOException if reading from the stream fails or the XML reader
   *         cannot be created with external entities disabled
   */
  public static boolean create(InputStream in, EntryInserter inserter)
      throws IOException {

    DictionaryContenthandler handler = new DictionaryContenthandler(inserter);

    XMLReader xmlReader;
    try {
      xmlReader = XMLReaderFactory.createXMLReader();
      // Fail closed: a reader which cannot be hardened is not used at all.
      xmlReader.setFeature(FEATURE_EXTERNAL_GENERAL_ENTITIES, false);
      xmlReader.setFeature(FEATURE_EXTERNAL_PARAMETER_ENTITIES, false);
    }
    catch (SAXException e) {
      throw new IOException("Unable to set up an XML reader with external " +
            "entity resolution disabled!", e);
    }

    xmlReader.setContentHandler(handler);

    try {
      xmlReader.parse(new InputSource(new UncloseableInputStream(in)));
    }
    catch (SAXException e) {
      throw new InvalidFormatException("The profile data stream has " +
            "an invalid format!", e);
    }

    return handler.mIsCaseSensitiveDictionary;
  }

  /**
   * Serializes the given entries to the given {@link OutputStream} as a
   * case sensitive dictionary.
   *
   * After the serialization is finished the provided
   * {@link OutputStream} remains open.
   *
   * @param out stream to serialize to
   * @param entries entries to serialize
   *
   * @throws IOException If an I/O error occurs
   * @deprecated Use {@link DictionarySerializer#serialize(java.io.OutputStream, java.util.Iterator, boolean)} instead
   */
  @Deprecated
  public static void serialize(OutputStream out, Iterator<Entry> entries)
      throws IOException {
    DictionarySerializer.serialize(out, entries, true);
  }

  /**
   * Serializes the given entries to the given {@link OutputStream}.
   *
   * After the serialization is finished the provided
   * {@link OutputStream} remains open.
   *
   * @param out stream to serialize to
   * @param entries entries to serialize
   * @param casesensitive indicates if the written dictionary
   *        should be case sensitive or case insensitive.
   *
   * @throws IOException If an I/O error occurs
   */
  public static void serialize(OutputStream out, Iterator<Entry> entries,
      boolean casesensitive)
      throws IOException {
    StreamResult streamResult = new StreamResult(out);
    SAXTransformerFactory tf = (SAXTransformerFactory)
        SAXTransformerFactory.newInstance();

    TransformerHandler hd;
    try {
      hd = tf.newTransformerHandler();
    } catch (TransformerConfigurationException e) {
      // No stylesheet is involved, so this is not expected to happen; keep
      // the cause so that a broken JAXP setup can still be diagnosed.
      throw new AssertionError("The Transformer configuration must be valid!", e);
    }

    Transformer serializer = hd.getTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, CHARSET);
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");

    hd.setResult(streamResult);

    try {
      hd.startDocument();

      AttributesImpl dictionaryAttributes = new AttributesImpl();

      dictionaryAttributes.addAttribute("", "", ATTRIBUTE_CASE_SENSITIVE,
          "", String.valueOf(casesensitive));
      hd.startElement("", "", DICTIONARY_ELEMENT, dictionaryAttributes);

      while (entries.hasNext()) {
        serializeEntry(hd, entries.next());
      }

      hd.endElement("", "", DICTIONARY_ELEMENT);

      hd.endDocument();
    }
    catch (SAXException e) {
      throw new IOException("Error during serialization: " + e.getMessage(), e);
    }
  }

  /**
   * Writes one {@code entry} element: the attributes of the entry become XML
   * attributes and every token becomes a nested {@code token} element.
   *
   * @param hd handler which receives the SAX events
   * @param entry entry to write
   *
   * @throws SAXException if the handler fails to process an event
   */
  private static void serializeEntry(TransformerHandler hd, Entry entry)
      throws SAXException {

    AttributesImpl entryAttributes = new AttributesImpl();

    Attributes attributes = entry.getAttributes();

    for (Iterator<String> it = attributes.iterator(); it.hasNext();) {
      String key = it.next();

      entryAttributes.addAttribute("", "", key,
          "", attributes.getValue(key));
    }

    hd.startElement("", "", ENTRY_ELEMENT, entryAttributes);

    StringList tokens = entry.getTokens();

    for (String token1 : tokens) {

      hd.startElement("", "", TOKEN_ELEMENT, new AttributesImpl());

      hd.characters(token1.toCharArray(), 0, token1.length());

      hd.endElement("", "", TOKEN_ELEMENT);
    }

    hd.endElement("", "", ENTRY_ELEMENT);
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy