All Downloads are FREE. Search and download functionalities are using the official Maven repository.

gate.sgml.Sgml2Xml Maven / Gradle / Ivy

/*
 *  Sgml2Xml.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Cristian URSU,  4/July/2000
 *
 *  $Id: Sgml2Xml.java 19660 2016-10-10 07:57:55Z markagreenwood $
 */

package gate.sgml;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.util.*;

import gate.Document;
import gate.util.Files;


/**
  * Not so fast...
  * This class is not a realy Sgml2Xml convertor.
  * It takes an SGML document and tries to prepare it for an XML parser
  * For a true conversion we need an Java SGML parser...
  * If you know one let me know....
  *
  * What does it do:
  * 
    *
  • If it finds something like this : <element attribute = value> * it will produce: <element attribute = "value"> *
  • If it finds something like this : <element something * attribute2=value>it will produce : <element * defaultAttribute="something" attribute2="value"> *
  • If it finds : <element att1='value1 value2' att2="value2 * value3"> it will produce: <element att1="value1 value2" * att2="value2 value3"> *
  • If it finds : <element1> <elem>text </element1> * will produce: <element1> <elem>text<elem> * </element1> *
  • If it find : <element1> <elem>[white spaces] * </element1>, * it will produce:<element1> <elem/>[white spaces]< * /element1> *
* What doesn't: *
    *
  • Doesn't expand the entities. So the entities from the SGML document * must be resolved by the XML parser *
  • Doesn't replace internal entities with their corresponding value *
*/ public class Sgml2Xml{ /** * The constructor initialises some member fields * @param SgmlDoc the content of the Sgml document that will be modified */ public Sgml2Xml(String SgmlDoc){ // create a new modifier m_modifier = new StringBuffer(SgmlDoc); // create a new dobiousElements list // se the explanatin at the end of the class dubiousElements = new ArrayList(); stack = new Stack(); } /** * The other constructor * @param doc The Gate document that will be transformed to XML */ public Sgml2Xml(Document doc){ // set as a member m_doc = doc; // create a new modifier m_modifier = new StringBuffer(m_doc.getContent().toString()); // create a new dobiousElements list // se the explanatin at the end of the class dubiousElements = new ArrayList(); stack = new Stack(); } /* I keep this just in case I need some more debuging public static void main(String[] args){ Sgml2Xml convertor = new Sgml2Xml(" 0 if (charPos > 0){ // this is not an empty element because there is text that follows // set the element from the top of the stack to be a non empty one o.setClosePos(charPos); o.setEmpty(false); // reset the charPos charPos = 0; }//if (charPos > 0) }//if (!stack.isEmpty()) }//if ('<' == m_currChar) // if currChar is not whiteSpace then save the position of the last // char that was read if (('<' != currChar) && !isWhiteSpace(currChar)) charPos = m_cursor; }//doState1 /** We came from state 1 and just read '<' If currChar == '/' -> state 11 If is a char != white spaces -> state 3 stay in state 2 while there are only white spaces */ private void doState2(char currChar){ if ('/' == currChar){ // go to state 11 m_currState = 11; } // if currChar is a char != white spaces then go to state 3 if (('/' != m_currChar) && !isWhiteSpace(m_currChar)){ // save the position where starts the element's name // we need that in order to be able to read the current tag name // this name it will be read from m_modifier using the substring() method elemNameStart = m_cursor -1; // go to state 3 m_currState = 3; } }// doState2 /** * Just read the first char from the element's name and now analize the next * char. * If '>' the elem name was a single char -> state 1 * IF is WhiteSpaces -> state 4 * Otherwise stay in state 3 and read the elemnt's name */ private void doState3(char currChar){ if ( '>' == currChar ){ // save the pos where the element's name ends elemNameEnd = m_cursor - 1; // this is also the pos where to insert '/' for empty elements. // In this case we have this situation sau < w> closePos = m_cursor - 1; // get the name of the element elemName = m_modifier.substring(elemNameStart,elemNameEnd); // we put the element into stack // we think in this point that the element is empty... performFinalAction(elemName, closePos); // go to state 1 m_currState = 1; } if (isWhiteSpace(currChar)){ // go to state 4 m_currState = 4; // save the pos where the element's name ends elemNameEnd = m_cursor - 1; // get the name of the element elemName = m_modifier.substring(elemNameStart,elemNameEnd); } }// doState3 /** * We read the name of the element and we prepare for '>' or attributes * '>' -> state 1 * any char !- white space -> state 5 */ private void doState4(char currChar){ if ( '>' == currChar ){ // this is also the pos where to insert '/' for empty elements in this case closePos = m_cursor -1 ; // we put the element into stack // we think in this point that the element is empty... performFinalAction(elemName, closePos); // go to state 1 m_currState = 1; } if (( '>' != currChar ) && !isWhiteSpace(currChar)){ // we just read the first char from the attrib name or attrib value.. // go to state 5 m_currState = 5; // remember the position where starts the attrib or the value of an attrib attrStart = m_cursor - 1; } } // doState4 /** * '=' -> state 6 * '>' -> state 4 (we didn't read an attribute but a value of the * defaultAtt ) * WS (white spaces) we don't know yet if we read an attribute or the value * of the defaultAttr -> state 10 * This state modifies the content onf m_modifier ... it adds text */ private void doState5(char currChar){ if ( '=' == currChar ) m_currState = 6; if ( '>' == currChar ){ // this mean that the attribute was a value and we have to create // a default attribute // the same as in state 10 attrEnd = m_cursor - 1 ; m_modifier.insert(attrEnd,'"'); m_modifier.insert(attrStart,"defaultAttr=\""); // go to state 4 m_currState = 4; // parse again the entire sequence from state 4 before reading any char m_cursor = attrStart; } if (isWhiteSpace(currChar)){ // go to state 10 m_currState = 10; // record the position where ends this attribute attrEnd = m_cursor - 1; } } // doState5 /** * IF we read ' or " then we have to get prepared to read everything until * the next ' or " * If we read a char then -> state 8; * Stay here while we read WS */ private void doState6(char currChar){ if ( ('\'' == currChar) || ('"' == currChar) ){ endPair = currChar; if ('\'' == currChar){ // we have to replace ' with " m_modifier = m_modifier.replace(m_cursor - 1, m_cursor,"\""); } m_currState = 7; } if ( ('\'' != currChar) && ('"' != currChar) && !isWhiteSpace(currChar)){ // this means that curChar is any char m_currState = 8; // every value must be inside this pair"" m_modifier.insert(m_cursor - 1, '"'); // insert implies the modification of m_cursor // we increment m_cursor in order to say in the same position and to // anulate the efect of insert. m_cursor ++; } }// doState6 /** * If we find the pair ' or " go to state 9 * Otherwhise read everything and stay in state 7 * If in state 7 we read '>' then we add automaticaly a " at the end and go * to state 1 */ private void doState7(char currChar){ //if ( ('\'' == currChar) || ('"' == currChar) ){ if ( endPair == currChar ){ if ('\'' == currChar){ // we have to replace ' with " m_modifier = m_modifier.replace(m_cursor - 1, m_cursor,"\""); } // reset the endPair endPair = ' '; m_currState = 9; } if ('>' == currChar){ // go to state 1 m_currState = 1; // insert the final " ata the end m_modifier.insert(m_cursor - 1, '"'); // go to te current possition (because of insert) m_cursor ++; performFinalAction(elemName, m_cursor - 1); } }// doState7 /** * If '>' go to state 1 * If WS go to state 9 * Stays in state 8 and read the attribute's value */ private void doState8(char currChar){ if ('>' == currChar){ // go to state 1 m_currState = 1; // complete the end " ( state 5) * If '>' we just read a beggining tag -> state 1 * Stay here while read WS */ private void doState9(char currChar){ if ('>' == currChar){ // go to state 1 m_currState = 1; // add the object to the stack performFinalAction(elemName, m_cursor - 1); } if (('>' != currChar) && !isWhiteSpace(m_currChar)){ // this is the same as state 4->5 m_currState = 5; attrStart = m_cursor - 1; } }//doState9 /** * If any C -> state 4 * If '=' state 6 * Stays here while reads WS */ private void doState10(char currChar){ if ('=' == currChar) m_currState = 6; if ( ('=' != currChar) && !isWhiteSpace(currChar)){ // this mean that the attribute was a value and we have to create // a default attribute m_modifier.insert(attrEnd,'"'); m_modifier.insert(attrStart,"defaultAttr=\""); // go to state 4 m_currState = 4; m_cursor = attrStart; } }// doState10 /** * We are preparing to read the and definition of an element * Stays in this state while reading WS */ private void doState11(char currChar){ if (!isWhiteSpace(currChar)){ m_currState = 12; elemNameStart = m_cursor - 1; } } // doState11 /** * Here we read the element's name ...this is an end tag * Stays here while reads a char */ private void doState12(char currChar) { if ('>' == currChar){ elemNameEnd = m_cursor - 1; elemName = m_modifier.substring(elemNameStart,elemNameEnd); performActionWithEndElem(elemName); m_currState = 1; } if (isWhiteSpace(currChar)){ m_currState = 13; elemNameEnd = m_cursor - 1; } }//doState12 /** * If '>' -> state 1 * Stays here while reads WS */ private void doState13(char currChar) { if ('>' == currChar){ elemName = m_modifier.substring(elemNameStart,elemNameEnd); performActionWithEndElem(elemName); m_currState = 1; } } // doState13 /** This method is responsable with document conversion */ public String convert()throws IOException,MalformedURLException { while (thereAreCharsToBeProcessed()) { // read() gets the next char and increment the m_cursor m_currChar = read(); switch(m_currState){ case 1: doState1(m_currChar);break; case 2: doState2(m_currChar);break; case 3: doState3(m_currChar);break; case 4: doState4(m_currChar);break; case 5: doState5(m_currChar);break; case 6: doState6(m_currChar);break; case 7: doState7(m_currChar);break; case 8: doState8(m_currChar);break; case 9: doState9(m_currChar);break; case 10: doState10(m_currChar);break; case 11: doState11(m_currChar);break; case 12: doState12(m_currChar);break; case 13: doState13(m_currChar);break; }// switch(m_currState) }// while (thereAreCharsToBeProcessed()) // put all the elements from the stack into the dubiousElements list // we do that in order to colect all the dubious elements while (!stack.isEmpty()) { CustomObject obj = stack.pop(); dubiousElements.add(obj); } // sort the dubiousElements list descending on closePos... // This is vital for the alghorithm because we have to make // all the modifications from the bottom to the top... // If we fail to do that, insert will change indices and // CustomObject.getClosePos() will not be acurate anymore. Collections.sort(dubiousElements, new MyComparator()); //here we resolve all the dubious Elements... // see the description of makeFinalModifications() method ListIterator listIterator = dubiousElements.listIterator(); while (listIterator.hasNext()){ CustomObject obj = listIterator.next(); makeFinalModifications(obj); } //finally add the XML prolog m_modifier.insert(0,"\n"); //Out.println(m_modifier.toString()); /* // get a InputStream from m_modifier and write it into a temp file // finally return the URI of the new XML document ByteArrayInputStream is = new ByteArrayInputStream( m_modifier.toString().getBytes() ); */ // this method is in gate.util package File file = Files.writeTempFile(m_modifier.toString(),"UTF-8"); //return m_doc.getSourceURL().toString(); return file.toURI().toURL().toString(); }// convert() /** * This method tests to see if there are more char to be read * It will return false when there are no more chars to be read */ private boolean thereAreCharsToBeProcessed() { if (m_cursor < m_modifier.length()) return true; else return false; }//thereAreCharsToBeProcessed /** * This method reads a char and increments the m_cursor */ private char read(){ return m_modifier.charAt(m_cursor ++); }//read /** * This is the action when we finished to read the entire tag * The action means that we put the tag into stack and consider that is empty * as default */ private void performFinalAction(String elemName, int pos) { // create anew CustomObject CustomObject obj = new CustomObject(); // set its properties obj.setElemName(elemName); obj.setClosePos(pos); // default we consider every element to be empty // in state 1 we modify that if the element is followed by text obj.setEmpty(true); stack.push(obj); } // performFinalAction /** * This is the action performed when an end tag is read. * The action consists in colecting all the dubiosElements(elements without * an end tag). They are considered dubious because we don't know if they * are empty or may be closed... Only the DTD can provide this information. * We don't have a DTD so we will consider that all dubious elements * followed by text will close at the end of the text... * If a dubious element is followed by another element then is * automaticaly considered an empty element. * * @param elemName is the the name of the end tag that was read */ private void performActionWithEndElem(String elemName) { CustomObject obj = null; boolean stop = false; // get all the elements that are dubious from the stack // the iteration will stop when an element is equal with elemName while (!stack.isEmpty() && !stop){ // eliminate the object from the stack obj = stack.pop(); //if its elemName is equal with the param elemName we stop the itteration if (obj.getElemName().equalsIgnoreCase(elemName)) stop = true; // otherwhise add the element to the doubiousElements list else dubiousElements.add(obj); } }//performActionWithEndElem /** * This method is called after we read the entire SGML document * It resolves the dobious Elements this way: *
    *
  • * 1. We don't have a DTD so we will consider that all dubious elements * followed by text will close at the end of the text... *
  • * 2. If a dubious element is followed by another element then is automaticaly considered an empty element. * * An element is considered dubious when we don't know if it is empty * or may be closed... * * @param aCustomObject an object from the dubiousElements list */ private void makeFinalModifications(CustomObject aCustomObject) { String endElement = null; // if the element is empty then we add / before > like this: // -> if (aCustomObject.isEmpty()) m_modifier.insert(aCustomObject.getClosePos(),"/"); // otherwhise we create an end element // -> else{ // create the end element endElement = ""; // insert it where the closePos indicates m_modifier.insert(aCustomObject.getClosePos(), endElement); } } // makeFinalModifications /** * Tests if c is a white space char */ private boolean isWhiteSpace(char c) { return Character.isWhitespace(c); } // this is a gate Document... It's content will be transferred to // m_modifier private Document m_doc = null; // this is the modifier that will transform an SGML document into an // XML document private StringBuffer m_modifier = null; // we need the stack to be able to remember the order of the tags private Stack stack = null; // this is a list with all the tags that are not colsed... // some of them are empty tags and some of them are not... private List dubiousElements = null; // this is tre current position inside the modifier private int m_cursor = 0; // the current state of the SGML2XML automata private int m_currState = 1; // the char that was read from the m_modifier @ position m_cursor private char m_currChar = ' '; // the fields above are used by the convert method and its auxiliary functions // like doState1...13() // indicates the last position of a text character (one which is not a white // space) // it is used in doState1() when we have to decide if an element is empty or // not // We decide that based on this field // If the charPos > 0 then it means that the object from the top of stack // is followed by text and we consider that is not empty private int charPos = 0; // is the current tag name private String elemName = null; // indicates where in the m_modifier begins the current tag elemName private int elemNameStart = 0; // indicates where in the m_modifier ends the current tag elemName // we need that in order to be able to read the current tag name // this name it will be read from m_modifier using the substring() method // it will be something like this : // elemName = m_modifier.substring(elemNameStart,elemNameEnd) // Eg: -> <[elemNameStart]w[elemNameEnd] [attr1=val1> private int elemNameEnd = 0; // this is the position there a start tag ends like this: // Eg: -> private int closePos = 0; //this is the position where an attribute starts... // we need it when we have to add the defaultAttr (see state 5) private int attrStart = 0; //this is the position where an attribute ends... // we need it when we have to add the defaultAttr (see state 5) or to add " // Eg: -> private int attrEnd = 0; // endPair field is used in states 6 and 7.... // When we read something like this : // attr=' val1 val2 val3' endPair remembers what is the pair for the beginning // string // Note that a combination like: attr = ' val1 val2 " will have an unexpected // behaviour... // We need this field when we have the following situation // attr1 = " val1 val2 ' val3" . We need to know what is the end pair for ". // In this case we can't allow ' to be the endPair private char endPair = ' '; } // class Sgml2Xml /** * The objects belonging to this class are used inside the stack */ class CustomObject { // constructor public CustomObject() { elemName = null; closePos = 0; empty = false; } // accessor public String getElemName() { return elemName; } public int getClosePos() { return closePos; } public boolean isEmpty() { return empty; } // modifiers void setElemName(String anElemName) { elemName = anElemName; } void setClosePos(int aPos){ closePos = aPos; } void setEmpty(boolean anEmptyValue) { empty = anEmptyValue; } // data fields private String elemName = null; private int closePos = 0; private boolean empty = false; } // CustomObject class MyComparator implements Comparator, Serializable { private static final long serialVersionUID = -3559488985426858804L; public MyComparator() { } @Override public int compare(CustomObject co1, CustomObject co2) { int result = 0; if (co1.getClosePos() < co2.getClosePos()) result = -1; if (co1.getClosePos() == co2.getClosePos()) result = 0; if (co1.getClosePos() > co2.getClosePos()) result = 1; return -result; } // compare }// class MyComparator




© 2015 - 2024 Weber Informatics LLC | Privacy Policy