All downloads are free. The search and download functionality uses the official Maven repository.

org.tinygroup.xmlparser.parser.XmlStringParser Maven / Gradle / Ivy

There is a newer version: 3.4.9
Show newest version
/**
 *  Copyright (c) 1997-2013, www.tinygroup.org ([email protected]).
 *
 *  Licensed under the GPL, Version 3.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *       http://www.gnu.org/licenses/gpl.html
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.tinygroup.xmlparser.parser;

import org.tinygroup.xmlparser.XmlDocument;
import org.tinygroup.xmlparser.XmlNodeType;
import org.tinygroup.xmlparser.document.XmlDocumentImpl;
import org.tinygroup.xmlparser.node.XmlNode;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class XmlStringParser extends XmlParser {
    /** Suffix appended to a node type to form the cache key of its tail-end pattern. */
    private static final String TAIL_END_PATTERN = "TailEndPattern";
    /** Suffix appended to a node type to form the cache key of its head-end pattern. */
    public static final String HEAD_END_PATTERN = "HeadEndPattern";
    /** Current read offset into the XML source; updated by parse, parseNode and parseHeader. */
    private int start = 0;
	// Tag name (word characters, Chinese characters, '.', ':' or '-') followed by
	// optional whitespace and a closing '>'.
	// NOTE(review): the field name suggests it is meant for end tags - confirm.
	private static final Pattern endTagName = Pattern
			.compile("(\\w|[\u4e00-\u9fa5]|[.]|[:]|[-])+\\s*>");
	// Tag name of a head (opening) tag.
	private static final Pattern startTagName = Pattern
			.compile("(\\w|[\u4e00-\u9fa5]|[-]|[:]|[.])+");
	// A single tag attribute: name = "value", name = 'value' or name = bareValue.
	private static final Pattern attribute = Pattern
			.compile("(\\b(\\w|[\u4e00-\u9fa5]|[/]|[:]|[.]|[-])+\\s*=\\s*\"[^\"]*\")|(\\b(\\w|[\u4e00-\u9fa5]|[/]|[.]|[:]|[-])+\\s*=\\s*'[^']*')|(\\b(\\w|[\u4e00-\u9fa5]|[/]|[.]|[-])+\\s*=\\s*(\\w|[\u4e00-\u9fa5]|[.]|[-])+)");
	// Cache of compiled head-end / tail-end patterns, keyed by the node type's string
	// form concatenated with HEAD_END_PATTERN or TAIL_END_PATTERN. Typed so that
	// lookups yield a Pattern without a cast.
	private static final Map<String, Pattern> patternTable = new HashMap<String, Pattern>();

	/**
	 * Scans {@code xmlSource} from the current {@code start} offset and attaches
	 * the content it finds to {@code pnode}.
	 * <p>
	 * Non-blank text found between {@code start} and the next head-start match is
	 * added to {@code pnode} as a TEXT node. The method advances the {@code start}
	 * field as it consumes input and calls itself recursively for nodes that have
	 * a body.
	 * <p>
	 * NOTE(review): part of this method was lost from the source (see the
	 * {@code else if} line after the {@code "/>"} check). The declarations of
	 * {@code node}, {@code nodetype} and {@code m} used further down are in the
	 * missing span, so the comments below only describe what is still visible.
	 *
	 * @param xmlSource the complete XML text being parsed
	 * @param pnode     the node that receives the parsed children
	 * @return {@code null} at every return point that is still visible, or the
	 *         non-null value propagated unchanged from a recursive call when it
	 *         does not end with {@code pnode}'s node name
	 */
	private String parseNode(String xmlSource, XmlNode pnode) {
		Pattern pattern = Pattern.compile(getHeadStartPattern());
		Matcher matcher = pattern.matcher(xmlSource);
		matcher.region(start, xmlSource.length());// restrict matching to the unread remainder
		nexttag: while (matcher.find()) {
			if (start < matcher.start()) {
				// Text between the read offset and the match: keep it as a TEXT
				// child unless it is only whitespace.
				String str = xmlSource.substring(start, matcher.start());
				if (str.trim().length() > 0) {
					XmlNode node = new XmlNode(XmlNodeType.TEXT);
					node.setContent(str);
					pnode.addNode(node);
				}
				start = matcher.start();
			}
			String headStart = xmlSource.substring(matcher.start(),
					matcher.end());// the matched head-start string
			if (headStart.equals("/>")) {
				// End reached: move past the match and return to the caller
				start = matcher.end();
				return null;
			// NOTE(review): the next line is corrupted - source text is missing between
			// the opening quote and "', matcher.end()));". Restore from the upstream file.
			} else if (headStart.equals("', matcher.end()));
						parseHeader(xmlSource, node, m);
					}
					// Read the head tag terminator
					if (nodetype.getHead().getEnd() != null) {
						Pattern p = patternTable.get(nodetype
								+ HEAD_END_PATTERN);
						if (p == null) {
							p = Pattern.compile(getHeadEndPattern(nodetype));
							patternTable.put(nodetype + HEAD_END_PATTERN, p);
						}
						Matcher m = p.matcher(xmlSource);
						m.region(start, xmlSource.length());
						if (m.find()) {
							// Character right before the terminator; '/' is tested below
							char c = xmlSource.charAt(m.start() - 1);
							start = m.end();
							if (c != '/') {
								if (nodetype.isHasBody()) {// the node has children: read them recursively
									String r = parseNode(xmlSource, node);
									if (r != null) {
										// A non-null result ending with this parent's name stops
										// here; anything else is handed further up.
										if (r.endsWith(pnode.getNodeName())) {
											return null;
										} else {
											return r;
										}
									} else {
										matcher.region(start,
												xmlSource.length());
										continue nexttag;
									}
								}
							} else {// original note: "if the node is text content, continue parsing its text content"; reached when c == '/'
								matcher.region(start, xmlSource.length());
								continue nexttag;
							}
						}
					}
					// Read the tail tag
					if (nodetype.getTail() != null
							&& nodetype.getTail().getEnd() != null) {
						Pattern p = patternTable.get(nodetype
								+ TAIL_END_PATTERN);
						if (p == null) {
							p = Pattern.compile(getTailEndPattern(nodetype));
							patternTable.put(nodetype + TAIL_END_PATTERN, p);
						}
						Matcher m = p.matcher(xmlSource);
						m.region(matcher.end(), xmlSource.length());
						if (m.find()) {
							// Everything between the head-start match and the tail
							// terminator becomes the node's content.
							node.setContent(xmlSource.substring(matcher.end(),
									m.start()));
							start = m.end();
							matcher.region(start, xmlSource.length());
							continue nexttag;
						}
					}
				}
			}
		}
		return null;
	}

	/**
	 * Copies every attribute found by {@code m} onto {@code node}.
	 * <p>
	 * Each match is split at its first {@code '='}: the trimmed left part is the
	 * attribute name and the trimmed right part the value, with one pair of
	 * surrounding double or single quotes removed. After each attribute the
	 * {@code start} field is moved to the end of that match.
	 *
	 * @param xmlSource the XML text the matcher offsets refer to
	 * @param node      the node that receives the attributes
	 * @param m         matcher positioned on the region holding the attributes
	 */
	private void parseHeader(String xmlSource, XmlNode node, Matcher m) {
		while (m.find()) {
			int matchEnd = m.end();
			String pair = xmlSource.substring(m.start(), matchEnd);
			int separator = pair.indexOf('=');
			String name = pair.substring(0, separator).trim();
			String value = pair.substring(separator + 1).trim();
			// Double- and single-quoted values are unwrapped the same way.
			boolean quoted = value.startsWith("\"") || value.startsWith("'");
			if (quoted) {
				value = value.substring(1, value.length() - 1);
			}
			node.setAttribute(name, value);
			start = matchEnd;
		}
	}

	/**
	 * Parses an XML string into an {@link XmlDocument}.
	 * <p>
	 * The source is scanned for head-start tokens. For each token whose text equals
	 * the head start of an {@link XmlNodeType}, a node of that type is created and
	 * registered on the document (root element, comment, doctype, XML declaration or
	 * processing instruction). Element bodies are read through {@code parseNode};
	 * for other node types the text up to the tail terminator becomes the node's
	 * content. Text that precedes a head-start token at this level is discarded.
	 * <p>
	 * A tag start that is never closed by {@code '>'} no longer raises an
	 * {@link IndexOutOfBoundsException} while reading attributes: the attribute
	 * search then runs to the end of the input.
	 *
	 * @param xmlSource the XML text to parse
	 * @return the document holding whatever could be recognised in {@code xmlSource}
	 */
	public XmlDocument parse(String xmlSource) {
		XmlDocument document = new XmlDocumentImpl();
		Pattern pattern = Pattern.compile(getHeadStartPattern());
		Matcher matcher = pattern.matcher(xmlSource);
		nexttag: while (matcher.find()) {
			// Characters before the match are not valid at document level: skip them
			start = matcher.end();
			String headStart = matcher.group();
			for (XmlNodeType nt : XmlNodeType.values()) {
				if (nt.getHead() != null && nt.getHead().getStart() != null
						&& nt.getHead().getStart().equals(headStart)) {
					// Found the node type for this token: register a node of that type
					XmlNode node = new XmlNode(nt);
					if (nt == XmlNodeType.ELEMENT) {
						document.setRoot(node);
					} else if (nt == XmlNodeType.COMMENT) {
						document.addComment(node);
					} else if (nt == XmlNodeType.DOCTYPE) {
						document.addDoctype(node);
					} else if (nt == XmlNodeType.XML_DECLARATION) {
						document.setXmlDeclaration(node);
					} else if (nt == XmlNodeType.PROCESSING_INSTRUCTION) {
						document.addProcessingInstruction(node);
					}
					// Read the tag name
					if (nt == XmlNodeType.ELEMENT) {
						Matcher m = startTagName.matcher(xmlSource);
						m.region(start, xmlSource.length());
						if (m.find()) {
							node.setNodeName(xmlSource.substring(m.start(),
									m.end()));
							start = m.end();
						}
					}
					if (nt.isHasHeader()) {// collect the node's attributes
						Matcher m = attribute.matcher(xmlSource);
						// Attributes are searched up to the first '>' after the tag start.
						// indexOf yields -1 when the tag is never closed, and the tag name
						// may have been found beyond that '>'; Matcher.region rejects both,
						// so clamp the end into [start, length].
						int headerEnd = xmlSource.indexOf('>', matcher.end());
						if (headerEnd < 0) {
							headerEnd = xmlSource.length();
						}
						m.region(start, Math.max(start, headerEnd));
						parseHeader(xmlSource, node, m);
					}
					// Read the head tag terminator
					if (nt.getHead().getEnd() != null) {
						// Explicit cast: patternTable may be declared as a raw Map
						Pattern p = (Pattern) patternTable.get(nt + HEAD_END_PATTERN);
						if (p == null) {
							p = Pattern.compile(getHeadEndPattern(nt));
							patternTable.put(nt + HEAD_END_PATTERN, p);
						}
						Matcher m = p.matcher(xmlSource);
						m.region(matcher.end(), xmlSource.length());
						if (m.find()) {
							start = m.end();
							if (nt.isHasBody()) {
								// parseNode advances the start field; resume the
								// document-level scan from wherever it stopped.
								parseNode(xmlSource, node);
								matcher.region(start, xmlSource.length());
								continue nexttag;
							}
						}
					}
					// Read the tail tag
					if (nt.getTail() != null && nt.getTail().getEnd() != null) {
						// Explicit cast: patternTable may be declared as a raw Map
						Pattern p = (Pattern) patternTable.get(nt + TAIL_END_PATTERN);
						if (p == null) {
							p = Pattern.compile(getTailEndPattern(nt));
							patternTable.put(nt + TAIL_END_PATTERN, p);
						}
						Matcher m = p.matcher(xmlSource);
						m.region(start, xmlSource.length());
						if (m.find()) {
							// Everything between the head-start token and the tail
							// terminator is stored as the node's content.
							node.setContent(xmlSource.substring(matcher.end(),
									m.start()));
							start = m.end();
							matcher.region(start, xmlSource.length());
							continue nexttag;
						}
					}
				}
			}
		}
		return document;
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy