All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.parser.HTMLFactory Maven / Gradle / Ivy

Go to download

The DSI utilities are a mishmash of classes accumulated during the last twenty years in projects developed at the DSI (Dipartimento di Scienze dell'Informazione, i.e., Information Sciences Department), now DI (Dipartimento di Informatica, i.e., Informatics Department), of the Università degli Studi di Milano.

There is a newer version: 2.7.3
Show newest version
/*
 * DSI utilities
 *
 * Copyright (C) 2005-2020 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

package it.unimi.dsi.parser;

import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.lang.MutableString;

/** A parsing factory for (X)HTML.
 *
 * 

Warning: for maximum flexibility, the methods of this factory * do not perform case normalisation. If you are parsing HTML, you are invited * to downcase your names before accessing {@link #getElement(MutableString)} * and {@link #getAttribute(MutableString)}. * *

This class is a singleton, and its only instance is accessible using the public field * {@link #INSTANCE}. * *

The relationship between this class and {@link Element}/{@link Attribute} is a bit * twisted due to the need to accomodate two features: *

    *
  • (X)HTML interned objects must be accessible directly (see, e.g., {@link Element#A}); *
  • (X)HTML interned objects must be put into suitable name-to-object maps. *
* *

To this purpose, this class exports packagewise some static factory methods that create {@link Element}s and * {@link Attribute}s and register them locally. The static initialisation code in * {@link Element} and {@link Attribute} creates elements such as {@link Element#A} using the abovementioned * factory methods. * *

An alternative implementation could use reflection, but I don't see great advantages. */ public class HTMLFactory implements ParsingFactory { private HTMLFactory() {} public static final HTMLFactory INSTANCE = new HTMLFactory(); @Override public Element getElement(final MutableString name) { return NAME2ELEMENT.get(name); } @Override public Attribute getAttribute(final MutableString name) { return NAME2ATTRIBUTE.get(name); } @Override public Entity getEntity(final MutableString name) { return NAME2ENTITY.get(name); } /** A (quick) map from entity names to entites. */ static final Object2ObjectOpenHashMap NAME2ENTITY = new Object2ObjectOpenHashMap<>(Hash.DEFAULT_INITIAL_SIZE, .5f); /** A (quick) map from attribute names to attributes. */ static final Object2ObjectOpenHashMap NAME2ATTRIBUTE = new Object2ObjectOpenHashMap<>(Hash.DEFAULT_INITIAL_SIZE, .5f); /** A (quick) map from element-type names to element types. */ static final Object2ObjectOpenHashMap NAME2ELEMENT = new Object2ObjectOpenHashMap<>(Hash.DEFAULT_INITIAL_SIZE, .5f); static Element newElement(final CharSequence name) { final Element element = new Element(name); NAME2ELEMENT.put(element.name, element); return element; } static Element newElement(final CharSequence name, final boolean breaksFlow, final boolean isSimple) { final Element element = new Element(name, breaksFlow, isSimple); NAME2ELEMENT.put(element.name, element); return element; } static Element newElement(final CharSequence name, final boolean breaksFlow, final boolean isSimple, final boolean isImplicit) { final Element element = new Element(name, breaksFlow, isSimple, isImplicit); NAME2ELEMENT.put(element.name, element); return element; } static Attribute newAttribute(final CharSequence name) { final Attribute attribute = new Attribute(name); NAME2ATTRIBUTE.put(attribute.name, attribute); return attribute; } static Entity newEntity(final CharSequence name, final char c) { final Entity entity = new Entity(name, c); 
NAME2ENTITY.put(entity.name, entity); return entity; } static { NAME2ATTRIBUTE.defaultReturnValue(Attribute.UNKNOWN); NAME2ELEMENT.defaultReturnValue(Element.UNKNOWN); // --- Entity Names ----------------------------------- // Latin 1 HTMLFactory.newEntity("nbsp", (char)160); HTMLFactory.newEntity("iexcl", (char)161); HTMLFactory.newEntity("cent", (char)162); HTMLFactory.newEntity("pound", (char)163); HTMLFactory.newEntity("curren", (char)164); HTMLFactory.newEntity("yen", (char)165); HTMLFactory.newEntity("brvbar", (char)166); HTMLFactory.newEntity("sect", (char)167); HTMLFactory.newEntity("uml", (char)168); HTMLFactory.newEntity("copy", (char)169); HTMLFactory.newEntity("ordf", (char)170); HTMLFactory.newEntity("laquo", (char)171); HTMLFactory.newEntity("not", (char)172); HTMLFactory.newEntity("shy", (char)173); HTMLFactory.newEntity("reg", (char)174); HTMLFactory.newEntity("macr", (char)175); HTMLFactory.newEntity("deg", (char)176); HTMLFactory.newEntity("plusmn", (char)177); HTMLFactory.newEntity("sup2", (char)178); HTMLFactory.newEntity("sup3", (char)179); HTMLFactory.newEntity("acute", (char)180); HTMLFactory.newEntity("micro", (char)181); HTMLFactory.newEntity("para", (char)182); HTMLFactory.newEntity("middot", (char)183); HTMLFactory.newEntity("cedil", (char)184); HTMLFactory.newEntity("sup1", (char)185); HTMLFactory.newEntity("ordm", (char)186); HTMLFactory.newEntity("raquo", (char)187); HTMLFactory.newEntity("frac14", (char)188); HTMLFactory.newEntity("frac12", (char)189); HTMLFactory.newEntity("frac34", (char)190); HTMLFactory.newEntity("iquest", (char)191); HTMLFactory.newEntity("Agrave", (char)192); HTMLFactory.newEntity("Aacute", (char)193); HTMLFactory.newEntity("Acirc", (char)194); HTMLFactory.newEntity("Atilde", (char)195); HTMLFactory.newEntity("Auml", (char)196); HTMLFactory.newEntity("Aring", (char)197); HTMLFactory.newEntity("AElig", (char)198); HTMLFactory.newEntity("Ccedil", (char)199); HTMLFactory.newEntity("Egrave", (char)200); 
HTMLFactory.newEntity("Eacute", (char)201); HTMLFactory.newEntity("Ecirc", (char)202); HTMLFactory.newEntity("Euml", (char)203); HTMLFactory.newEntity("Igrave", (char)204); HTMLFactory.newEntity("Iacute", (char)205); HTMLFactory.newEntity("Icirc", (char)206); HTMLFactory.newEntity("Iuml", (char)207); HTMLFactory.newEntity("ETH", (char)208); HTMLFactory.newEntity("Ntilde", (char)209); HTMLFactory.newEntity("Ograve", (char)210); HTMLFactory.newEntity("Oacute", (char)211); HTMLFactory.newEntity("Ocirc", (char)212); HTMLFactory.newEntity("Otilde", (char)213); HTMLFactory.newEntity("Ouml", (char)214); HTMLFactory.newEntity("times", (char)215); HTMLFactory.newEntity("Oslash", (char)216); HTMLFactory.newEntity("Ugrave", (char)217); HTMLFactory.newEntity("Uacute", (char)218); HTMLFactory.newEntity("Ucirc", (char)219); HTMLFactory.newEntity("Uuml", (char)220); HTMLFactory.newEntity("Yacute", (char)221); HTMLFactory.newEntity("THORN", (char)222); HTMLFactory.newEntity("szlig", (char)223); HTMLFactory.newEntity("agrave", (char)224); HTMLFactory.newEntity("aacute", (char)225); HTMLFactory.newEntity("acirc", (char)226); HTMLFactory.newEntity("atilde", (char)227); HTMLFactory.newEntity("auml", (char)228); HTMLFactory.newEntity("aring", (char)229); HTMLFactory.newEntity("aelig", (char)230); HTMLFactory.newEntity("ccedil", (char)231); HTMLFactory.newEntity("egrave", (char)232); HTMLFactory.newEntity("eacute", (char)233); HTMLFactory.newEntity("ecirc", (char)234); HTMLFactory.newEntity("euml", (char)235); HTMLFactory.newEntity("igrave", (char)236); HTMLFactory.newEntity("iacute", (char)237); HTMLFactory.newEntity("icirc", (char)238); HTMLFactory.newEntity("iuml", (char)239); HTMLFactory.newEntity("eth", (char)240); HTMLFactory.newEntity("ntilde", (char)241); HTMLFactory.newEntity("ograve", (char)242); HTMLFactory.newEntity("oacute", (char)243); HTMLFactory.newEntity("ocirc", (char)244); HTMLFactory.newEntity("otilde", (char)245); HTMLFactory.newEntity("ouml", (char)246); 
HTMLFactory.newEntity("divide", (char)247); HTMLFactory.newEntity("oslash", (char)248); HTMLFactory.newEntity("ugrave", (char)249); HTMLFactory.newEntity("uacute", (char)250); HTMLFactory.newEntity("ucirc", (char)251); HTMLFactory.newEntity("uuml", (char)252); HTMLFactory.newEntity("yacute", (char)253); HTMLFactory.newEntity("thorn", (char)254); HTMLFactory.newEntity("yuml", (char)255); // Special HTMLFactory.newEntity("quot", (char)34); HTMLFactory.newEntity("apos", (char)39); HTMLFactory.newEntity("amp", (char)38); HTMLFactory.newEntity("lt", (char)60); HTMLFactory.newEntity("gt", (char)62); HTMLFactory.newEntity("OElig", (char)338); HTMLFactory.newEntity("oelig", (char)339); HTMLFactory.newEntity("Scaron", (char)352); HTMLFactory.newEntity("scaron", (char)353); HTMLFactory.newEntity("Yuml", (char)376); HTMLFactory.newEntity("circ", (char)710); HTMLFactory.newEntity("tilde", (char)732); HTMLFactory.newEntity("ensp", (char)8194); HTMLFactory.newEntity("emsp", (char)8195); HTMLFactory.newEntity("thinsp", (char)8201); HTMLFactory.newEntity("zwnj", (char)8204); HTMLFactory.newEntity("zwj", (char)8205); HTMLFactory.newEntity("lrm", (char)8206); HTMLFactory.newEntity("rlm", (char)8207); HTMLFactory.newEntity("ndash", (char)8211); HTMLFactory.newEntity("mdash", (char)8212); HTMLFactory.newEntity("lsquo", (char)8216); HTMLFactory.newEntity("rsquo", (char)8217); HTMLFactory.newEntity("sbquo", (char)8218); HTMLFactory.newEntity("ldquo", (char)8220); HTMLFactory.newEntity("rdquo", (char)8221); HTMLFactory.newEntity("bdquo", (char)8222); HTMLFactory.newEntity("dagger", (char)8224); HTMLFactory.newEntity("Dagger", (char)8225); HTMLFactory.newEntity("permil", (char)8240); HTMLFactory.newEntity("lsaquo", (char)8249); HTMLFactory.newEntity("rsaquo", (char)8250); HTMLFactory.newEntity("euro", (char)8364); // Symbols HTMLFactory.newEntity("fnof", (char)402); HTMLFactory.newEntity("Alpha", (char)913); HTMLFactory.newEntity("Beta", (char)914); HTMLFactory.newEntity("Gamma", 
(char)915); HTMLFactory.newEntity("Delta", (char)916); HTMLFactory.newEntity("Epsilon", (char)917); HTMLFactory.newEntity("Zeta", (char)918); HTMLFactory.newEntity("Eta", (char)919); HTMLFactory.newEntity("Theta", (char)920); HTMLFactory.newEntity("Iota", (char)921); HTMLFactory.newEntity("Kappa", (char)922); HTMLFactory.newEntity("Lambda", (char)923); HTMLFactory.newEntity("Mu", (char)924); HTMLFactory.newEntity("Nu", (char)925); HTMLFactory.newEntity("Xi", (char)926); HTMLFactory.newEntity("Omicron", (char)927); HTMLFactory.newEntity("Pi", (char)928); HTMLFactory.newEntity("Rho", (char)929); HTMLFactory.newEntity("Sigma", (char)931); HTMLFactory.newEntity("Tau", (char)932); HTMLFactory.newEntity("Upsilon", (char)933); HTMLFactory.newEntity("Phi", (char)934); HTMLFactory.newEntity("Chi", (char)935); HTMLFactory.newEntity("Psi", (char)936); HTMLFactory.newEntity("Omega", (char)937); HTMLFactory.newEntity("alpha", (char)945); HTMLFactory.newEntity("beta", (char)946); HTMLFactory.newEntity("gamma", (char)947); HTMLFactory.newEntity("delta", (char)948); HTMLFactory.newEntity("epsilon", (char)949); HTMLFactory.newEntity("zeta", (char)950); HTMLFactory.newEntity("eta", (char)951); HTMLFactory.newEntity("theta", (char)952); HTMLFactory.newEntity("iota", (char)953); HTMLFactory.newEntity("kappa", (char)954); HTMLFactory.newEntity("lambda", (char)955); HTMLFactory.newEntity("mu", (char)956); HTMLFactory.newEntity("nu", (char)957); HTMLFactory.newEntity("xi", (char)958); HTMLFactory.newEntity("omicron", (char)959); HTMLFactory.newEntity("pi", (char)960); HTMLFactory.newEntity("rho", (char)961); HTMLFactory.newEntity("sigmaf", (char)962); HTMLFactory.newEntity("sigma", (char)963); HTMLFactory.newEntity("tau", (char)964); HTMLFactory.newEntity("upsilon", (char)965); HTMLFactory.newEntity("phi", (char)966); HTMLFactory.newEntity("chi", (char)967); HTMLFactory.newEntity("psi", (char)968); HTMLFactory.newEntity("omega", (char)969); HTMLFactory.newEntity("thetasym", (char)977); 
HTMLFactory.newEntity("upsih", (char)978); HTMLFactory.newEntity("piv", (char)982); HTMLFactory.newEntity("bull", (char)8226); HTMLFactory.newEntity("hellip", (char)8230); HTMLFactory.newEntity("prime", (char)8242); HTMLFactory.newEntity("Prime", (char)8243); HTMLFactory.newEntity("oline", (char)8254); HTMLFactory.newEntity("frasl", (char)8260); HTMLFactory.newEntity("weierp", (char)8472); HTMLFactory.newEntity("image", (char)8465); HTMLFactory.newEntity("real", (char)8476); HTMLFactory.newEntity("trade", (char)8482); HTMLFactory.newEntity("alefsym", (char)8501); HTMLFactory.newEntity("larr", (char)8592); HTMLFactory.newEntity("uarr", (char)8593); HTMLFactory.newEntity("rarr", (char)8594); HTMLFactory.newEntity("darr", (char)8595); HTMLFactory.newEntity("harr", (char)8596); HTMLFactory.newEntity("crarr", (char)8629); HTMLFactory.newEntity("lArr", (char)8656); HTMLFactory.newEntity("uArr", (char)8657); HTMLFactory.newEntity("rArr", (char)8658); HTMLFactory.newEntity("dArr", (char)8659); HTMLFactory.newEntity("hArr", (char)8660); HTMLFactory.newEntity("forall", (char)8704); HTMLFactory.newEntity("part", (char)8706); HTMLFactory.newEntity("exist", (char)8707); HTMLFactory.newEntity("empty", (char)8709); HTMLFactory.newEntity("nabla", (char)8711); HTMLFactory.newEntity("isin", (char)8712); HTMLFactory.newEntity("notin", (char)8713); HTMLFactory.newEntity("ni", (char)8715); HTMLFactory.newEntity("prod", (char)8719); HTMLFactory.newEntity("sum", (char)8721); HTMLFactory.newEntity("minus", (char)8722); HTMLFactory.newEntity("lowast", (char)8727); HTMLFactory.newEntity("radic", (char)8730); HTMLFactory.newEntity("prop", (char)8733); HTMLFactory.newEntity("infin", (char)8734); HTMLFactory.newEntity("ang", (char)8736); HTMLFactory.newEntity("and", (char)8743); HTMLFactory.newEntity("or", (char)8744); HTMLFactory.newEntity("cap", (char)8745); HTMLFactory.newEntity("cup", (char)8746); HTMLFactory.newEntity("int", (char)8747); HTMLFactory.newEntity("there4", (char)8756); 
HTMLFactory.newEntity("sim", (char)8764); HTMLFactory.newEntity("cong", (char)8773); HTMLFactory.newEntity("asymp", (char)8776); HTMLFactory.newEntity("ne", (char)8800); HTMLFactory.newEntity("equiv", (char)8801); HTMLFactory.newEntity("le", (char)8804); HTMLFactory.newEntity("ge", (char)8805); HTMLFactory.newEntity("sub", (char)8834); HTMLFactory.newEntity("sup", (char)8835); HTMLFactory.newEntity("nsub", (char)8836); HTMLFactory.newEntity("sube", (char)8838); HTMLFactory.newEntity("supe", (char)8839); HTMLFactory.newEntity("oplus", (char)8853); HTMLFactory.newEntity("otimes", (char)8855); HTMLFactory.newEntity("perp", (char)8869); HTMLFactory.newEntity("sdot", (char)8901); HTMLFactory.newEntity("lceil", (char)8968); HTMLFactory.newEntity("rceil", (char)8969); HTMLFactory.newEntity("lfloor", (char)8970); HTMLFactory.newEntity("rfloor", (char)8971); HTMLFactory.newEntity("lang", (char)9001); HTMLFactory.newEntity("rang", (char)9002); HTMLFactory.newEntity("loz", (char)9674); HTMLFactory.newEntity("spades", (char)9824); HTMLFactory.newEntity("clubs", (char)9827); HTMLFactory.newEntity("hearts", (char)9829); HTMLFactory.newEntity("diams", (char)9830); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy