All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.juneau.html.HtmlParserSession Maven / Gradle / Ivy

There is a newer version: 9.0.1
Show newest version
// ***************************************************************************************************************************
// * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file *
// * distributed with this work for additional information regarding copyright ownership.  The ASF licenses this file        *
// * to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance            *
// * with the License.  You may obtain a copy of the License at                                                              *
// *                                                                                                                         *
// *  http://www.apache.org/licenses/LICENSE-2.0                                                                             *
// *                                                                                                                         *
// * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an  *
// * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the License for the        *
// * specific language governing permissions and limitations under the License.                                              *
// ***************************************************************************************************************************
package org.apache.juneau.html;

import static javax.xml.stream.XMLStreamConstants.*;
import static org.apache.juneau.html.HtmlTag.*;
import static org.apache.juneau.internal.StringUtils.*;

import java.lang.reflect.*;
import java.util.*;

import javax.xml.stream.*;

import org.apache.juneau.*;
import org.apache.juneau.html.annotation.*;
import org.apache.juneau.parser.*;
import org.apache.juneau.transform.*;
import org.apache.juneau.xml.*;

/**
 * Session object that lives for the duration of a single use of {@link HtmlParser}.
 *
 * 

* This class is NOT thread safe. * It is typically discarded after one-time use although it can be reused against multiple inputs. */ @SuppressWarnings({ "unchecked", "rawtypes" }) public final class HtmlParserSession extends XmlParserSession { private static final Set whitespaceElements = new HashSet<>( Arrays.asList( new String[]{"br","bs","sp","ff"} ) ); /** * Create a new session using properties specified in the context. * * @param ctx * The context creating this session object. * The context contains all the configuration settings for this object. * @param args * Runtime session arguments. */ protected HtmlParserSession(HtmlParser ctx, ParserSessionArgs args) { super(ctx, args); } @Override /* ParserSession */ protected T doParse(ParserPipe pipe, ClassMeta type) throws Exception { return parseAnything(type, getXmlReader(pipe), getOuter(), true, null); } @Override /* ReaderParserSession */ protected Map doParseIntoMap(ParserPipe pipe, Map m, Type keyType, Type valueType) throws Exception { return parseIntoMap(getXmlReader(pipe), m, (ClassMeta)getClassMeta(keyType), (ClassMeta)getClassMeta(valueType), null); } @Override /* ReaderParserSession */ protected Collection doParseIntoCollection(ParserPipe pipe, Collection c, Type elementType) throws Exception { return parseIntoCollection(getXmlReader(pipe), c, getClassMeta(elementType), null); } /* * Reads anything starting at the current event. *

* Precondition: Must be pointing at outer START_ELEMENT. * Postcondition: Pointing at outer END_ELEMENT. */ private T parseAnything(ClassMeta eType, XmlReader r, Object outer, boolean isRoot, BeanPropertyMeta pMeta) throws Exception { if (eType == null) eType = (ClassMeta)object(); PojoSwap swap = (PojoSwap)eType.getPojoSwap(this); BuilderSwap builder = (BuilderSwap)eType.getBuilderSwap(this); ClassMeta sType = null; if (builder != null) sType = builder.getBuilderClassMeta(this); else if (swap != null) sType = swap.getSwapClassMeta(this); else sType = eType; setCurrentClass(sType); int event = r.getEventType(); if (event != START_ELEMENT) throw new ParseException(this, "parseAnything must be called on outer start element."); if (! isRoot) event = r.next(); boolean isEmpty = (event == END_ELEMENT); // Skip until we find a start element, end document, or non-empty text. if (! isEmpty) event = skipWs(r); if (event == END_DOCUMENT) throw new ParseException(this, "Unexpected end of stream in parseAnything for type ''{0}''", eType); // Handle @Html(asXml=true) beans. HtmlClassMeta hcm = sType.getExtendedMeta(HtmlClassMeta.class); if (hcm.getFormat() == HtmlFormat.XML) return super.parseAnything(eType, null, r, outer, false, pMeta); Object o = null; boolean isValid = true; HtmlTag tag = (event == CHARACTERS ? null : HtmlTag.forString(r.getName().getLocalPart(), false)); // If it's not a known tag, then parse it as XML. // Allows us to parse stuff like "

" into HTML5 beans. if (tag == null && event != CHARACTERS) return super.parseAnything(eType, null, r, outer, false, pMeta); if (tag == HTML) tag = skipToData(r); if (isEmpty) { o = ""; } else if (tag == null || tag.isOneOf(BR,BS,FF,SP)) { String text = parseText(r); if (sType.isObject() || sType.isCharSequence()) o = text; else if (sType.isChar()) o = parseCharacter(text); else if (sType.isBoolean()) o = Boolean.parseBoolean(text); else if (sType.isNumber()) o = parseNumber(text, (Class)eType.getInnerClass()); else if (sType.canCreateNewInstanceFromString(outer)) o = sType.newInstanceFromString(outer, text); else if (sType.canCreateNewInstanceFromNumber(outer)) o = sType.newInstanceFromNumber(this, outer, parseNumber(text, sType.getNewInstanceFromNumberClass())); else isValid = false; } else if (tag == STRING || (tag == A && pMeta != null && pMeta.getExtendedMeta(HtmlBeanPropertyMeta.class).getLink() != null)) { String text = getElementText(r); if (sType.isObject() || sType.isCharSequence()) o = text; else if (sType.isChar()) o = parseCharacter(text); else if (sType.canCreateNewInstanceFromString(outer)) o = sType.newInstanceFromString(outer, text); else if (sType.canCreateNewInstanceFromNumber(outer)) o = sType.newInstanceFromNumber(this, outer, parseNumber(text, sType.getNewInstanceFromNumberClass())); else isValid = false; skipTag(r, tag == STRING ? xSTRING : xA); } else if (tag == NUMBER) { String text = getElementText(r); if (sType.isObject()) o = parseNumber(text, Number.class); else if (sType.isNumber()) o = parseNumber(text, (Class)sType.getInnerClass()); else if (sType.canCreateNewInstanceFromNumber(outer)) o = sType.newInstanceFromNumber(this, outer, parseNumber(text, sType.getNewInstanceFromNumberClass())); else isValid = false; skipTag(r, xNUMBER); } else if (tag == BOOLEAN) { String text = getElementText(r); if (sType.isObject() || sType.isBoolean()) o = Boolean.parseBoolean(text); else isValid = false; skipTag(r, xBOOLEAN); } else if (tag == P) { String text = getElementText(r); if (! "No Results".equals(text)) isValid = false; skipTag(r, xP); } else if (tag == NULL) { skipTag(r, NULL); skipTag(r, xNULL); } else if (tag == A) { o = parseAnchor(r, eType); skipTag(r, xA); } else if (tag == TABLE) { String typeName = getAttribute(r, getBeanTypePropertyName(eType), "object"); ClassMeta cm = getClassMeta(typeName, pMeta, eType); if (cm != null) { sType = eType = cm; typeName = sType.isCollectionOrArray() ? "array" : "object"; } else if (! "array".equals(typeName)) { // Type name could be a subtype name. typeName = sType.isCollectionOrArray() ? "array" : "object"; } if (typeName.equals("object")) { if (sType.isObject()) { o = parseIntoMap(r, (Map)new ObjectMap(this), sType.getKeyType(), sType.getValueType(), pMeta); } else if (sType.isMap()) { o = parseIntoMap(r, (Map)(sType.canCreateNewInstance(outer) ? sType.newInstance(outer) : new ObjectMap(this)), sType.getKeyType(), sType.getValueType(), pMeta); } else if (builder != null) { BeanMap m = toBeanMap(builder.create(this, eType)); o = builder.build(this, parseIntoBean(r, m).getBean(), eType); } else if (sType.canCreateNewBean(outer)) { BeanMap m = newBeanMap(outer, sType.getInnerClass()); o = parseIntoBean(r, m).getBean(); } else { isValid = false; } skipTag(r, xTABLE); } else if (typeName.equals("array")) { if (sType.isObject()) o = parseTableIntoCollection(r, (Collection)new ObjectList(this), sType, pMeta); else if (sType.isCollection()) o = parseTableIntoCollection(r, (Collection)(sType.canCreateNewInstance(outer) ? sType.newInstance(outer) : new ObjectList(this)), sType, pMeta); else if (sType.isArray() || sType.isArgs()) { ArrayList l = (ArrayList)parseTableIntoCollection(r, new ArrayList(), sType, pMeta); o = toArray(sType, l); } else isValid = false; skipTag(r, xTABLE); } else { isValid = false; } } else if (tag == UL) { String typeName = getAttribute(r, getBeanTypePropertyName(eType), "array"); ClassMeta cm = getClassMeta(typeName, pMeta, eType); if (cm != null) sType = eType = cm; if (sType.isObject()) o = parseIntoCollection(r, new ObjectList(this), sType, pMeta); else if (sType.isCollection() || sType.isObject()) o = parseIntoCollection(r, (Collection)(sType.canCreateNewInstance(outer) ? sType.newInstance(outer) : new ObjectList(this)), sType, pMeta); else if (sType.isArray() || sType.isArgs()) o = toArray(sType, parseIntoCollection(r, new ArrayList(), sType, pMeta)); else isValid = false; skipTag(r, xUL); } if (! isValid) throw new ParseException(this, "Unexpected tag ''{0}'' for type ''{1}''", tag, eType); if (swap != null && o != null) o = swap.unswap(this, o, eType); if (outer != null) setParent(eType, o, outer); skipWs(r); return (T)o; } /* * For parsing output from HtmlDocSerializer, this skips over the head, title, and links. */ private HtmlTag skipToData(XmlReader r) throws Exception { while (true) { int event = r.next(); if (event == START_ELEMENT && "div".equals(r.getLocalName()) && "data".equals(r.getAttributeValue(null, "id"))) { r.nextTag(); event = r.getEventType(); boolean isEmpty = (event == END_ELEMENT); // Skip until we find a start element, end document, or non-empty text. if (! isEmpty) event = skipWs(r); if (event == END_DOCUMENT) throw new ParseException(this, "Unexpected end of stream looking for data."); return (event == CHARACTERS ? null : HtmlTag.forString(r.getName().getLocalPart(), false)); } } } private static String getAttribute(XmlReader r, String name, String def) { for (int i = 0; i < r.getAttributeCount(); i++) if (r.getAttributeLocalName(i).equals(name)) return r.getAttributeValue(i); return def; } /* * Reads an anchor tag and converts it into a bean. */ private T parseAnchor(XmlReader r, ClassMeta beanType) throws Exception { String href = r.getAttributeValue(null, "href"); String name = getElementText(r); Class beanClass = beanType.getInnerClass(); if (beanClass.isAnnotationPresent(HtmlLink.class)) { HtmlLink h = beanClass.getAnnotation(HtmlLink.class); BeanMap m = newBeanMap(beanClass); m.put(h.uriProperty(), href); m.put(h.nameProperty(), name); return m.getBean(); } return convertToType(href, beanType); } private static Map getAttributes(XmlReader r) { Map m = new TreeMap<>() ; for (int i = 0; i < r.getAttributeCount(); i++) m.put(r.getAttributeLocalName(i), r.getAttributeValue(i)); return m; } /* * Reads contents of element. * Precondition: Must be pointing at
event. * Postcondition: Pointing at next START_ELEMENT or END_DOCUMENT event. */ private Map parseIntoMap(XmlReader r, Map m, ClassMeta keyType, ClassMeta valueType, BeanPropertyMeta pMeta) throws Exception { while (true) { HtmlTag tag = nextTag(r, TR, xTABLE); if (tag == xTABLE) break; tag = nextTag(r, TD, TH); // Skip over the column headers. if (tag == TH) { skipTag(r); r.nextTag(); skipTag(r); } else { K key = parseAnything(keyType, r, m, false, pMeta); nextTag(r, TD); V value = parseAnything(valueType, r, m, false, pMeta); setName(valueType, value, key); m.put(key, value); } nextTag(r, xTR); } return m; } /* * Reads contents of
    element. * Precondition: Must be pointing at event following
      event. * Postcondition: Pointing at next START_ELEMENT or END_DOCUMENT event. */ private Collection parseIntoCollection(XmlReader r, Collection l, ClassMeta type, BeanPropertyMeta pMeta) throws Exception { int argIndex = 0; while (true) { HtmlTag tag = nextTag(r, LI, xUL); if (tag == xUL) break; ClassMeta elementType = type.isArgs() ? type.getArg(argIndex++) : type.getElementType(); l.add((E)parseAnything(elementType, r, l, false, pMeta)); } return l; } /* * Reads contents of
        element. * Precondition: Must be pointing at event following
          event. * Postcondition: Pointing at next START_ELEMENT or END_DOCUMENT event. */ private Collection parseTableIntoCollection(XmlReader r, Collection l, ClassMeta type, BeanPropertyMeta pMeta) throws Exception { HtmlTag tag = nextTag(r, TR); List keys = new ArrayList<>(); while (true) { tag = nextTag(r, TH, xTR); if (tag == xTR) break; keys.add(getElementText(r)); } int argIndex = 0; while (true) { r.nextTag(); tag = HtmlTag.forEvent(this, r); if (tag == xTABLE) break; ClassMeta elementType = null; String beanType = getAttribute(r, getBeanTypePropertyName(type), null); if (beanType != null) elementType = getClassMeta(beanType, pMeta, null); if (elementType == null) elementType = type.isArgs() ? type.getArg(argIndex++) : type.getElementType(); if (elementType == null) elementType = object(); BuilderSwap builder = elementType.getBuilderSwap(this); if (builder != null || elementType.canCreateNewBean(l)) { BeanMap m = builder != null ? toBeanMap(builder.create(this, elementType)) : newBeanMap(l, elementType.getInnerClass()) ; for (int i = 0; i < keys.size(); i++) { tag = nextTag(r, TD, NULL); if (tag == NULL) { m = null; nextTag(r, xNULL); break; } String key = keys.get(i); BeanMapEntry e = m.getProperty(key); if (e == null) { //onUnknownProperty(key, m, -1, -1); parseAnything(object(), r, l, false, null); } else { BeanPropertyMeta bpm = e.getMeta(); ClassMeta cm = bpm.getClassMeta(); Object value = parseAnything(cm, r, m.getBean(false), false, bpm); setName(cm, value, key); bpm.set(m, key, value); } } l.add( m == null ? null : builder != null ? builder.build(this, m.getBean(), elementType) : (E)m.getBean() ); } else { String c = getAttributes(r).get(getBeanTypePropertyName(type.getElementType())); Map m = (Map)(elementType.isMap() && elementType.canCreateNewInstance(l) ? elementType.newInstance(l) : new ObjectMap(this)); for (int i = 0; i < keys.size(); i++) { tag = nextTag(r, TD, NULL); if (tag == NULL) { m = null; nextTag(r, xNULL); break; } String key = keys.get(i); if (m != null) { ClassMeta kt = elementType.getKeyType(), vt = elementType.getValueType(); Object value = parseAnything(vt, r, l, false, pMeta); setName(vt, value, key); m.put(convertToType(key, kt), value); } } if (m != null && c != null) { ObjectMap m2 = (m instanceof ObjectMap ? (ObjectMap)m : new ObjectMap(m).setBeanSession(this)); m2.put(getBeanTypePropertyName(type.getElementType()), c); l.add((E)cast(m2, pMeta, elementType)); } else { l.add((E)m); } } nextTag(r, xTR); } return l; } /* * Reads contents of
element. * Precondition: Must be pointing at event following
event. * Postcondition: Pointing at next START_ELEMENT or END_DOCUMENT event. */ private BeanMap parseIntoBean(XmlReader r, BeanMap m) throws Exception { while (true) { HtmlTag tag = nextTag(r, TR, xTABLE); if (tag == xTABLE) break; tag = nextTag(r, TD, TH); // Skip over the column headers. if (tag == TH) { skipTag(r); r.nextTag(); skipTag(r); } else { String key = getElementText(r); nextTag(r, TD); BeanPropertyMeta pMeta = m.getPropertyMeta(key); if (pMeta == null) { onUnknownProperty(key, m); parseAnything(object(), r, null, false, null); } else { ClassMeta cm = pMeta.getClassMeta(); Object value = parseAnything(cm, r, m.getBean(false), false, pMeta); setName(cm, value, key); pMeta.set(m, key, value); } } nextTag(r, xTR); } return m; } /* * Reads the next tag. Advances past anything that's not a start or end tag. Throws an exception if * it's not one of the expected tags. * Precondition: Must be pointing before the event we want to parse. * Postcondition: Pointing at the tag just parsed. */ private HtmlTag nextTag(XmlReader r, HtmlTag...expected) throws Exception { int et = r.next(); while (et != START_ELEMENT && et != END_ELEMENT && et != END_DOCUMENT) et = r.next(); if (et == END_DOCUMENT) throw new ParseException(this, "Unexpected end of document."); HtmlTag tag = HtmlTag.forEvent(this, r); if (expected.length == 0) return tag; for (HtmlTag t : expected) if (t == tag) return tag; throw new ParseException(this, "Unexpected tag: ''{0}''. Expected one of the following: {1}", tag, expected); } /* * Skips over the current element and advances to the next element. *

* Precondition: Pointing to opening tag. * Postcondition: Pointing to next opening tag. * * @param r The stream being read from. * @throws XMLStreamException */ private void skipTag(XmlReader r) throws Exception { int et = r.getEventType(); if (et != START_ELEMENT) throw new ParseException(this, "skipToNextTag() call on invalid event ''{0}''. Must only be called on START_ELEMENT events.", XmlUtils.toReadableEvent(r) ); String n = r.getLocalName(); int depth = 0; while (true) { et = r.next(); if (et == START_ELEMENT) { String n2 = r.getLocalName(); if (n.equals(n2)) depth++; } else if (et == END_ELEMENT) { String n2 = r.getLocalName(); if (n.equals(n2)) depth--; if (depth < 0) return; } } } private void skipTag(XmlReader r, HtmlTag...expected) throws Exception { HtmlTag tag = HtmlTag.forEvent(this, r); if (tag.isOneOf(expected)) r.next(); else throw new ParseException(this, "Unexpected tag: ''{0}''. Expected one of the following: {1}", tag, expected); } private static int skipWs(XmlReader r) throws XMLStreamException { int event = r.getEventType(); while (event != START_ELEMENT && event != END_ELEMENT && event != END_DOCUMENT && r.isWhiteSpace()) event = r.next(); return event; } /** * Parses CHARACTERS data. * *

* Precondition: Pointing to event immediately following opening tag. * Postcondition: Pointing to closing tag. * * @param r The stream being read from. * @return The parsed string. * @throws XMLStreamException */ @Override /* XmlParserSession */ protected final String parseText(XmlReader r) throws Exception { StringBuilder sb = getStringBuilder(); int et = r.getEventType(); if (et == END_ELEMENT) return ""; int depth = 0; String characters = null; while (true) { if (et == START_ELEMENT) { if (characters != null) { if (sb.length() == 0) characters = trimStart(characters); sb.append(characters); characters = null; } HtmlTag tag = HtmlTag.forEvent(this, r); if (tag == BR) { sb.append('\n'); r.nextTag(); } else if (tag == BS) { sb.append('\b'); r.nextTag(); } else if (tag == SP) { et = r.next(); if (et == CHARACTERS) { String s = r.getText(); if (s.length() > 0) { char c = r.getText().charAt(0); if (c == '\u2003') c = '\t'; sb.append(c); } r.nextTag(); } } else if (tag == FF) { sb.append('\f'); r.nextTag(); } else if (tag.isOneOf(STRING, NUMBER, BOOLEAN)) { et = r.next(); if (et == CHARACTERS) { sb.append(r.getText()); r.nextTag(); } } else { sb.append('<').append(r.getLocalName()); for (int i = 0; i < r.getAttributeCount(); i++) sb.append(' ').append(r.getAttributeName(i)).append('=').append('\'').append(r.getAttributeValue(i)).append('\''); sb.append('>'); depth++; } } else if (et == END_ELEMENT) { if (characters != null) { if (sb.length() == 0) characters = trimStart(characters); if (depth == 0) characters = trimEnd(characters); sb.append(characters); characters = null; } if (depth == 0) break; sb.append('<').append(r.getLocalName()).append('>'); depth--; } else if (et == CHARACTERS) { characters = r.getText(); } et = r.next(); } String s = trim(sb.toString()); returnStringBuilder(sb); return s; } /** * Identical to {@link #parseText(XmlReader)} except assumes the current event is the opening tag. * *

* Precondition: Pointing to opening tag. * Postcondition: Pointing to closing tag. * * @param r The stream being read from. * @return The parsed string. * @throws XMLStreamException */ @Override /* XmlParserSession */ protected final String getElementText(XmlReader r) throws Exception { r.next(); return parseText(r); } @Override /* XmlParserSession */ protected final boolean isWhitespaceElement(XmlReader r) { String s = r.getLocalName(); return whitespaceElements.contains(s); } @Override /* XmlParserSession */ protected final String parseWhitespaceElement(XmlReader r) throws Exception { HtmlTag tag = HtmlTag.forEvent(this, r); int et = r.next(); if (tag == BR) { return "\n"; } else if (tag == BS) { return "\b"; } else if (tag == FF) { return "\f"; } else if (tag == SP) { if (et == CHARACTERS) { String s = r.getText(); if (s.charAt(0) == '\u2003') s = "\t"; r.next(); return decodeString(s); } return ""; } else { throw new ParseException(this, "Invalid tag found in parseWhitespaceElement(): ''{0}''", tag); } } }