// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
// You can buy this project and download/modify it as often as you want.
/*
* Sgml2Xml.java
*
* Copyright (c) 1995-2012, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Cristian URSU, 4/July/2000
*
* $Id: Sgml2Xml.java 19660 2016-10-10 07:57:55Z markagreenwood $
*/
package gate.sgml;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.util.*;
import gate.Document;
import gate.util.Files;
/**
* Not so fast...
* This class is not really an SGML-to-XML converter.
* It takes an SGML document and tries to prepare it for an XML parser.
* For a true conversion we would need a Java SGML parser...
* If you know of one, let me know....
*
* What it does:
* <ul>
* <li>If it finds something like this: {@code <element attribute = value>}
* it will produce: {@code <element attribute = "value">}</li>
* <li>If it finds something like this:
* {@code <element something attribute2=value>}
* it will produce:
* {@code <element defaultAttr="something" attribute2="value">}</li>
* <li>If it finds:
* {@code <element att1='value1 value2' att2="value2 value3">}
* it will produce:
* {@code <element att1="value1 value2" att2="value2 value3">}</li>
* <li>If it finds: {@code <element1> <elem>text </element1>}
* it will produce: {@code <element1> <elem>text</elem> </element1>}</li>
* <li>If it finds: {@code <element1> <elem>[white spaces] </element1>}
* it will produce: {@code <element1> <elem/>[white spaces] </element1>}</li>
* </ul>
*
* What it doesn't do:
* <ul>
* <li>It doesn't expand the entities, so the entities from the SGML document
* must be resolved by the XML parser.</li>
* <li>It doesn't replace internal entities with their corresponding
* values.</li>
* </ul>
*/
public class Sgml2Xml{
/**
 * Creates a converter working on the text of an SGML document.
 * The text is copied into the internal buffer that the automaton edits in
 * place, and the element stack and the list of dubious elements start empty.
 *
 * @param SgmlDoc the content of the SGML document that will be modified
 */
public Sgml2Xml(String SgmlDoc){
  // the buffer on which every modification is performed
  m_modifier = new StringBuffer(SgmlDoc);
  // elements for which no end tag was seen; they are resolved at the end of
  // convert()
  dubiousElements = new ArrayList<CustomObject>();
  // start tags that are still waiting for their end tag
  stack = new Stack<CustomObject>();
}
/**
 * Creates a converter working on the content of a GATE document.
 * The document content is turned into a string and handed to the
 * string-based constructor; the document itself is kept as a member.
 *
 * @param doc The Gate document that will be transformed to XML
 */
public Sgml2Xml(Document doc){
  // buffer, dubious-element list and stack are set up by the other constructor
  this(doc.getContent().toString());
  // remember where the content came from
  m_doc = doc;
}
/* I keep this just in case I need some more debuging
public static void main(String[] args){
Sgml2Xml convertor =
new Sgml2Xml(" 0
if (charPos > 0){
// this is not an empty element because there is text that follows
// set the element from the top of the stack to be a non empty one
o.setClosePos(charPos);
o.setEmpty(false);
// reset the charPos
charPos = 0;
}//if (charPos > 0)
}//if (!stack.isEmpty())
}//if ('<' == m_currChar)
// if currChar is not whiteSpace then save the position of the last
// char that was read
if (('<' != currChar) && !isWhiteSpace(currChar))
charPos = m_cursor;
}//doState1
/**
 * State 2: we came from state 1 and have just read the opening angle
 * bracket of a tag.
 * <ul>
 * <li>a slash moves the automaton to state 11;</li>
 * <li>any other non-whitespace char is taken as the first char of an
 * element name: its index is stored in elemNameStart and the automaton
 * moves to state 3;</li>
 * <li>whitespace leaves the automaton in state 2.</li>
 * </ul>
 *
 * @param currChar the char that was just read
 */
private void doState2(char currChar){
  if ('/' == currChar){
    // go to state 11
    m_currState = 11;
  } else if (!isWhiteSpace(currChar)){
    // read() has already advanced m_cursor, so the char just read is at
    // m_cursor - 1; the name is later extracted from m_modifier with
    // substring() starting at this index
    elemNameStart = m_cursor - 1;
    // go to state 3
    m_currState = 3;
  }
}// doState2
/**
 * State 3: the first char of a start tag's name has been read; keep reading
 * the name.
 * <ul>
 * <li>a closing angle bracket ends both the name and the tag: the element
 * is pushed on the stack and the automaton returns to state 1;</li>
 * <li>whitespace ends the name and moves to state 4;</li>
 * <li>any other char belongs to the name, so the state is unchanged.</li>
 * </ul>
 *
 * @param currChar the char that was just read
 */
private void doState3(char currChar){
  if ('>' == currChar){
    // index of the closing bracket: the name ends here, and this is also
    // where a '/' has to be inserted should the element turn out to be empty
    final int bracketPos = m_cursor - 1;
    elemNameEnd = bracketPos;
    closePos = bracketPos;
    elemName = m_modifier.substring(elemNameStart, elemNameEnd);
    // the element goes on the stack; at this point it is assumed to be empty
    performFinalAction(elemName, closePos);
    m_currState = 1;
  } else if (isWhiteSpace(currChar)){
    // the name ends right before the whitespace that was just read
    elemNameEnd = m_cursor - 1;
    elemName = m_modifier.substring(elemNameStart, elemNameEnd);
    m_currState = 4;
  }
}// doState3
/**
 * State 4: the element name has been read; a closing angle bracket or an
 * attribute may follow.
 * <ul>
 * <li>a closing angle bracket pushes the element on the stack and returns
 * to state 1;</li>
 * <li>any other non-whitespace char starts an attribute name or value:
 * its index is stored in attrStart and the automaton moves to state 5;</li>
 * <li>whitespace is skipped.</li>
 * </ul>
 *
 * @param currChar the char that was just read
 */
private void doState4(char currChar){
  if (isWhiteSpace(currChar)){
    // nothing to do, stay in state 4
    return;
  }
  if ('>' == currChar){
    // index of the closing bracket, where '/' goes for an empty element
    closePos = m_cursor - 1;
    // the element goes on the stack; at this point it is assumed to be empty
    performFinalAction(elemName, closePos);
    m_currState = 1;
  } else {
    // first char of an attribute name or of a bare value
    attrStart = m_cursor - 1;
    m_currState = 5;
  }
} // doState4
/**
 * State 5: reading a token that is either an attribute name or a bare value.
 * <ul>
 * <li>'=' : the token was an attribute name, go to state 6;</li>
 * <li>a closing angle bracket: the token was a bare value; it is wrapped
 * as {@code defaultAttr="token"} inside m_modifier, the cursor is moved
 * back to the start of the inserted text and parsing resumes from
 * state 4;</li>
 * <li>whitespace: it is not yet known which of the two the token is; its
 * end is recorded in attrEnd and the automaton moves to state 10.</li>
 * </ul>
 * This state modifies the content of m_modifier.
 *
 * @param currChar the char that was just read
 */
private void doState5(char currChar){
  switch (currChar){
    case '=':
      m_currState = 6;
      break;
    case '>':
      // the token ends right before the bracket that was just read
      attrEnd = m_cursor - 1;
      // closing quote first: inserting at the higher index does not shift
      // attrStart
      m_modifier.insert(attrEnd,'"');
      m_modifier.insert(attrStart,"defaultAttr=\"");
      // parse the rewritten attribute again, starting from state 4
      m_cursor = attrStart;
      m_currState = 4;
      break;
    default:
      if (isWhiteSpace(currChar)){
        // record the position where this token ends
        attrEnd = m_cursor - 1;
        m_currState = 10;
      }
  }
} // doState5
/**
 * State 6: an '=' has been read and an attribute value is expected.
 * <ul>
 * <li>a single or double quote opens a quoted value: the quote char is
 * remembered in endPair, a single quote is rewritten as a double quote,
 * and the automaton moves to state 7;</li>
 * <li>any other non-whitespace char starts an unquoted value: an opening
 * double quote is inserted in front of it and the automaton moves to
 * state 8;</li>
 * <li>whitespace is skipped.</li>
 * </ul>
 *
 * @param currChar the char that was just read
 */
private void doState6(char currChar){
  final boolean opensQuotedValue = ('\'' == currChar) || ('"' == currChar);
  if (opensQuotedValue){
    // only the same kind of quote will be accepted as the end of the value
    endPair = currChar;
    if ('\'' == currChar){
      // the char just read sits at m_cursor - 1; turn ' into "
      m_modifier.setCharAt(m_cursor - 1, '"');
    }
    m_currState = 7;
    return;
  }
  if (!isWhiteSpace(currChar)){
    // every value must be enclosed in double quotes
    m_modifier.insert(m_cursor - 1, '"');
    // the insert shifted the text by one char; advance the cursor so that
    // it keeps pointing at the same logical position
    m_cursor ++;
    m_currState = 8;
  }
}// doState6
/**
 * State 7: inside a quoted attribute value.
 * <ul>
 * <li>the quote char stored in endPair closes the value (a closing single
 * quote is rewritten as a double quote) and the automaton moves to
 * state 9;</li>
 * <li>a closing angle bracket ends the tag even though the value was not
 * closed: a double quote is inserted in front of it, the element is pushed
 * on the stack and the automaton returns to state 1;</li>
 * <li>everything else is part of the value.</li>
 * </ul>
 *
 * @param currChar the char that was just read
 */
private void doState7(char currChar){
  if (currChar == endPair){
    if ('\'' == currChar){
      // turn the closing ' into "
      m_modifier.setCharAt(m_cursor - 1, '"');
    }
    // reset the endPair
    endPair = ' ';
    m_currState = 9;
  }
  if ('>' == currChar){
    // add the missing closing quote in front of the bracket
    m_modifier.insert(m_cursor - 1, '"');
    // compensate for the insert so the cursor stays just past the bracket
    m_cursor ++;
    m_currState = 1;
    performFinalAction(elemName, m_cursor - 1);
  }
}// doState7
/**
* If '>' go to state 1
* If WS go to state 9
* Stays in state 8 and read the attribute's value
*/
private void doState8(char currChar){
if ('>' == currChar){
// go to state 1
m_currState = 1;
// complete the end " ( state 5)
* If '>' we just read a beggining tag -> state 1
* Stay here while read WS
*/
/**
 * State 9: an attribute value has been completed.
 * <ul>
 * <li>a closing angle bracket ends the start tag: the element is pushed on
 * the stack and the automaton returns to state 1;</li>
 * <li>any other non-whitespace char starts the next attribute (same as
 * the transition from state 4 to state 5);</li>
 * <li>whitespace is skipped.</li>
 * </ul>
 *
 * @param currChar the char that was just read
 */
private void doState9(char currChar){
  if ('>' == currChar){
    // go to state 1
    m_currState = 1;
    // add the object to the stack
    performFinalAction(elemName, m_cursor - 1);
  } else if (!isWhiteSpace(currChar)){
    // this is the same as state 4->5
    m_currState = 5;
    attrStart = m_cursor - 1;
  }
}//doState9
/**
 * State 10: a token followed by whitespace has been read and it is still
 * unknown whether it was an attribute name or a bare value.
 * <ul>
 * <li>'=' : it was an attribute name, go to state 6;</li>
 * <li>any other non-whitespace char: it was a bare value; it is wrapped as
 * {@code defaultAttr="token"}, the cursor is moved back to the start of the
 * inserted text and parsing resumes from state 4;</li>
 * <li>whitespace is skipped.</li>
 * </ul>
 *
 * @param currChar the char that was just read
 */
private void doState10(char currChar){
  if ('=' == currChar){
    m_currState = 6;
    return;
  }
  if (isWhiteSpace(currChar)){
    // stay in state 10
    return;
  }
  // closing quote first: inserting at the higher index does not shift
  // attrStart
  m_modifier.insert(attrEnd,'"');
  m_modifier.insert(attrStart,"defaultAttr=\"");
  // parse the rewritten attribute again, starting from state 4
  m_cursor = attrStart;
  m_currState = 4;
}// doState10
/**
 * State 11: preparing to read the name of an end tag.
 * Whitespace is skipped; the first non-whitespace char marks the start of
 * the name and moves the automaton to state 12.
 *
 * @param currChar the char that was just read
 */
private void doState11(char currChar){
  if (isWhiteSpace(currChar)){
    return;
  }
  // the char just read sits at m_cursor - 1
  elemNameStart = m_cursor - 1;
  m_currState = 12;
} // doState11
/**
 * State 12: reading the name of an end tag.
 * <ul>
 * <li>a closing angle bracket ends the name and the tag: the end tag is
 * processed and the automaton returns to state 1;</li>
 * <li>whitespace ends the name and moves to state 13;</li>
 * <li>any other char belongs to the name.</li>
 * </ul>
 *
 * @param currChar the char that was just read
 */
private void doState12(char currChar) {
  if ('>' == currChar){
    // the name ends right before the bracket that was just read
    elemNameEnd = m_cursor - 1;
    elemName = m_modifier.substring(elemNameStart, elemNameEnd);
    performActionWithEndElem(elemName);
    m_currState = 1;
  } else if (isWhiteSpace(currChar)){
    // remember where the name ended; the bracket is awaited in state 13
    elemNameEnd = m_cursor - 1;
    m_currState = 13;
  }
}//doState12
/**
 * State 13: the name of an end tag has been read and the closing angle
 * bracket is awaited. When it arrives the end tag is processed and the
 * automaton returns to state 1; every other char leaves the state unchanged.
 *
 * @param currChar the char that was just read
 */
private void doState13(char currChar) {
  if ('>' != currChar){
    return;
  }
  // name boundaries were recorded in states 11 and 12
  elemName = m_modifier.substring(elemNameStart, elemNameEnd);
  performActionWithEndElem(elemName);
  m_currState = 1;
} // doState13
/**
 * This method is responsible for the document conversion.
 * <ol>
 * <li>Every char of the buffer is fed to the handler of the current
 * automaton state (doState1 ... doState13).</li>
 * <li>Elements still on the stack when the input ends are added to the
 * dubious elements.</li>
 * <li>The dubious elements are sorted by descending close position and
 * resolved one by one, so that each insertion leaves the positions of the
 * elements still to be processed untouched.</li>
 * <li>The XML prolog is prepended and the result is handed to
 * gate.util.Files.writeTempFile.</li>
 * </ol>
 *
 * @return the URL, as a string, of the file returned by
 * Files.writeTempFile
 * @throws IOException declared for the file-writing step
 * @throws MalformedURLException declared for the URI to URL conversion
 */
public String convert()throws IOException,MalformedURLException {
  while (thereAreCharsToBeProcessed()) {
    // read() gets the next char and increments m_cursor
    m_currChar = read();
    switch(m_currState){
      case 1: doState1(m_currChar);break;
      case 2: doState2(m_currChar);break;
      case 3: doState3(m_currChar);break;
      case 4: doState4(m_currChar);break;
      case 5: doState5(m_currChar);break;
      case 6: doState6(m_currChar);break;
      case 7: doState7(m_currChar);break;
      case 8: doState8(m_currChar);break;
      case 9: doState9(m_currChar);break;
      case 10: doState10(m_currChar);break;
      case 11: doState11(m_currChar);break;
      case 12: doState12(m_currChar);break;
      case 13: doState13(m_currChar);break;
    }// switch(m_currState)
  }// while (thereAreCharsToBeProcessed())
  // whatever is left on the stack never got an end tag, so it is dubious too
  while (!stack.isEmpty()) {
    dubiousElements.add(stack.pop());
  }
  // Sort the dubious elements descending on closePos. This is vital for the
  // algorithm because the modifications have to be made from the bottom of
  // the text to the top: otherwise an insert would shift the text and the
  // close positions recorded for later elements would no longer be accurate.
  Collections.sort(dubiousElements, new MyComparator());
  // resolve all the dubious elements, see makeFinalModifications()
  for (Object element : dubiousElements) {
    makeFinalModifications((CustomObject) element);
  }
  // finally add the XML prolog
  m_modifier.insert(0,"<?xml version=\"1.0\" ?>\n");
  // writeTempFile is in the gate.util package
  File file = Files.writeTempFile(m_modifier.toString(),"UTF-8");
  return file.toURI().toURL().toString();
}// convert()
/**
 * Tells whether the cursor is still inside the buffer.
 *
 * @return true while m_cursor is smaller than the current length of
 * m_modifier, false once there are no more chars to be read
 */
private boolean thereAreCharsToBeProcessed() {
  return m_cursor < m_modifier.length();
}//thereAreCharsToBeProcessed
/**
 * Reads the char under the cursor and then moves the cursor one position
 * forward.
 *
 * @return the char found at the position m_cursor had when the method was
 * called
 */
private char read(){
  final char next = m_modifier.charAt(m_cursor);
  m_cursor ++;
  return next;
}//read
/**
 * Action performed once a whole start tag has been read: the tag is recorded
 * on the stack and is considered empty by default.
 *
 * @param elemName the name of the element whose start tag was read
 * @param pos the close position to record for the element
 */
private void performFinalAction(String elemName, int pos) {
  CustomObject pending = new CustomObject();
  // every element is considered empty by default; per the original notes,
  // state 1 changes that when the element is followed by text
  pending.setEmpty(true);
  pending.setClosePos(pos);
  pending.setElemName(elemName);
  stack.push(pending);
} // performFinalAction
/**
 * This is the action performed when an end tag is read.
 * Elements are popped from the stack until one whose name equals
 * elemName (ignoring case) is found; that element is simply discarded,
 * since it has now been closed. Every element popped before it is added to
 * the dubious elements (elements without an end tag). They are considered
 * dubious because we don't know if they are empty or may be closed... Only
 * the DTD can provide this information. We don't have a DTD, so all dubious
 * elements followed by text are considered to close at the end of the
 * text, and a dubious element followed by another element is automatically
 * considered an empty element.
 * If no element on the stack matches, the whole stack ends up in the
 * dubious elements.
 *
 * @param elemName is the name of the end tag that was read
 */
private void performActionWithEndElem(String elemName) {
  while (!stack.isEmpty()){
    // take the object off the stack
    CustomObject top = (CustomObject) stack.pop();
    if (top.getElemName().equalsIgnoreCase(elemName)){
      // this is the start tag matching the end tag: nothing more to collect
      return;
    }
    // no end tag was seen for this one
    dubiousElements.add(top);
  }
}//performActionWithEndElem
/**
 * This method is called after the entire SGML document has been read.
 * It resolves a dubious element this way:
 * <ol>
 * <li>We don't have a DTD, so a dubious element followed by text is
 * considered to close at the end of the text: an end tag carrying the
 * element's name is inserted at its close position.</li>
 * <li>A dubious element flagged as empty gets a '/' inserted at its close
 * position, turning the start tag into an empty-element tag.</li>
 * </ol>
 * An element is considered dubious when we don't know if it is empty
 * or may be closed...
 *
 * @param aCustomObject an object from the dubiousElements list
 */
private void makeFinalModifications(CustomObject aCustomObject) {
  final int insertAt = aCustomObject.getClosePos();
  if (aCustomObject.isEmpty()){
    // <w> -> <w/>
    m_modifier.insert(insertAt,"/");
  } else {
    // <w>text -> <w>text</w>
    String endElement = "</" + aCustomObject.getElemName() + ">";
    m_modifier.insert(insertAt, endElement);
  }
} // makeFinalModifications
/**
 * Tests if c is a white space char, as decided by
 * {@link Character#isWhitespace(char)}.
 *
 * @param c the char to test
 * @return the result of Character.isWhitespace for c
 */
private boolean isWhiteSpace(char c) {
  final boolean whiteSpace = Character.isWhitespace(c);
  return whiteSpace;
}
// The GATE document handed to the Document-based constructor; its content
// is copied into m_modifier
private Document m_doc = null;
// the buffer in which the SGML text is transformed, in place, into XML
private StringBuffer m_modifier = null;
// we need the stack to be able to remember the order of the tags
private Stack<CustomObject> stack = null;
// this is a list with all the tags that are not closed...
// some of them are empty tags and some of them are not...
private List<CustomObject> dubiousElements = null;
// this is the current position inside the modifier
private int m_cursor = 0;
// the current state of the SGML2XML automaton
private int m_currState = 1;
// the last char that was read from m_modifier; after read() it is the char
// at position m_cursor - 1
private char m_currChar = ' ';
// the fields above are used by the convert method and its auxiliary functions
// like doState1...13()

// Indicates the last position of a text character (one which is not a white
// space). It is used in doState1() when we have to decide if an element is
// empty or not: if charPos > 0 it means that the object on top of the stack
// is followed by text, and that object is considered not to be empty.
private int charPos = 0;
// the current tag name
private String elemName = null;
// indicates where in m_modifier the current tag name begins
private int elemNameStart = 0;
// Indicates where in m_modifier the current tag name ends. Together with
// elemNameStart it is used to read the current tag name with:
//   elemName = m_modifier.substring(elemNameStart,elemNameEnd)
// Eg: <[elemNameStart]w[elemNameEnd] attr1=val1>
private int elemNameEnd = 0;
// the position of the closing bracket of a start tag; this is where '/' is
// inserted for empty elements
private int closePos = 0;
// the position where an attribute starts...
// we need it when we have to add the defaultAttr (see state 5)
private int attrStart = 0;
// the position right after the last char of an attribute token...
// we need it when we have to add the defaultAttr (see state 5): the closing "
// is inserted here
private int attrEnd = 0;
// endPair is used in states 6 and 7....
// When we read something like this:
//   attr=' val1 val2 val3'
// endPair remembers which quote char opened the value and therefore has to
// close it.
// Note that a combination like: attr = ' val1 val2 " will have an unexpected
// behaviour...
// We need this field for a situation like attr1 = " val1 val2 ' val3": the
// value was opened with ", so ' must not be accepted as its end.
private char endPair = ' ';
} // class Sgml2Xml
/**
 * The objects belonging to this class are used inside the stack.
 * Each one holds an element name, a close position and a flag telling
 * whether the element is considered empty.
 */
class CustomObject {
  // data fields
  private String elemName;
  private int closePos;
  private boolean empty;

  /** Creates an object with no name, close position 0 and empty == false. */
  public CustomObject() {
    this.elemName = null;
    this.closePos = 0;
    this.empty = false;
  }

  // accessors

  /** @return the element name, or null if none was set */
  public String getElemName() {
    return this.elemName;
  }

  /** @return the close position stored for the element */
  public int getClosePos() {
    return this.closePos;
  }

  /** @return true if the element is currently flagged as empty */
  public boolean isEmpty() {
    return this.empty;
  }

  // modifiers

  /** @param anElemName the element name to store */
  void setElemName(String anElemName) {
    this.elemName = anElemName;
  }

  /** @param aPos the close position to store */
  void setClosePos(int aPos){
    this.closePos = aPos;
  }

  /** @param anEmptyValue the new value of the empty flag */
  void setEmpty(boolean anEmptyValue) {
    this.empty = anEmptyValue;
  }
} // CustomObject
/**
 * Orders CustomObject instances by descending close position: the object
 * with the larger closePos comes first.
 */
class MyComparator implements Comparator<CustomObject>, Serializable {
  private static final long serialVersionUID = -3559488985426858804L;

  public MyComparator() {
  }

  /**
   * Compares two objects on their close positions, in reverse order.
   *
   * @param co1 the first object
   * @param co2 the second object
   * @return a negative value if co1 has the larger closePos, a positive
   * value if it has the smaller one, 0 if both are equal
   */
  @Override
  public int compare(CustomObject co1, CustomObject co2) {
    // arguments swapped on purpose to obtain the descending order
    return Integer.compare(co2.getClosePos(), co1.getClosePos());
  } // compare
}// class MyComparator