Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.html;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.TextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
class HtmlHandler extends TextContentHandler {
// List of attributes that need to be resolved.
private static final Set URI_ATTRIBUTES =
new HashSet(Arrays.asList("src", "href", "longdesc", "cite"));
private static final Pattern ICBM =
Pattern.compile("\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*");
private final HtmlMapper mapper;
private final XHTMLContentHandler xhtml;
private final Metadata metadata;
private final StringBuilder title = new StringBuilder();
private int bodyLevel = 0;
private int discardLevel = 0;
private int titleLevel = 0;
private boolean isTitleSetToMetadata = false;
private HtmlHandler(
HtmlMapper mapper, XHTMLContentHandler xhtml, Metadata metadata) {
super(xhtml);
this.mapper = mapper;
this.xhtml = xhtml;
this.metadata = metadata;
// Try to determine the default base URL, if one has not been given
if (metadata.get(Metadata.CONTENT_LOCATION) == null) {
String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
if (name != null) {
name = name.trim();
try {
new URL(name); // test URL format
metadata.set(Metadata.CONTENT_LOCATION, name);
} catch (MalformedURLException e) {
// The resource name is not a valid URL, ignore it
}
}
}
}
public HtmlHandler(
HtmlMapper mapper, ContentHandler handler, Metadata metadata) {
this(mapper, new XHTMLContentHandler(handler, metadata), metadata);
}
@Override
public void startElement(
String uri, String local, String name, Attributes atts)
throws SAXException {
if ("TITLE".equals(name) || titleLevel > 0) {
titleLevel++;
}
if ("BODY".equals(name) || ("FRAMESET".equals(name)) || bodyLevel > 0) {
bodyLevel++;
}
if (mapper.isDiscardElement(name) || discardLevel > 0) {
discardLevel++;
}
if (bodyLevel == 0 && discardLevel == 0) {
if ("META".equals(name) && atts.getValue("content") != null) {
// TIKA-478: For cases where we have either a name or
// "http-equiv", assume that XHTMLContentHandler will emit
// these in the , thus passing them through safely.
if (atts.getValue("http-equiv") != null) {
addHtmlMetadata(
atts.getValue("http-equiv"),
atts.getValue("content"));
} else if (atts.getValue("name") != null) {
// Record the meta tag in the metadata
addHtmlMetadata(
atts.getValue("name"),
atts.getValue("content"));
} else if (atts.getValue("property") != null) {
// TIKA-983: Handle tags
metadata.add(
atts.getValue("property"),
atts.getValue("content"));
}
} else if ("BASE".equals(name) && atts.getValue("href") != null) {
startElementWithSafeAttributes("base", atts);
xhtml.endElement("base");
metadata.set(
Metadata.CONTENT_LOCATION,
resolve(atts.getValue("href")));
} else if ("LINK".equals(name)) {
startElementWithSafeAttributes("link", atts);
xhtml.endElement("link");
} else if ("SCRIPT".equals(name) && atts.getValue("src") != null) {
startElementWithSafeAttributes("script", atts);
xhtml.endElement("script");
}
}
if (bodyLevel > 0 && discardLevel == 0) {
String safe = mapper.mapSafeElement(name);
if (safe != null) {
startElementWithSafeAttributes(safe, atts);
}
}
title.setLength(0);
}
/**
* Adds a metadata setting from the HTML to the Tika metadata
* object. The name and value are normalized where possible.
*/
private void addHtmlMetadata(String name, String value) {
if (name == null || value == null) {
// ignore
} else if (name.equalsIgnoreCase("ICBM")) {
Matcher m = ICBM.matcher(value);
if (m.matches()) {
metadata.set("ICBM", m.group(1) + ", " + m.group(2));
metadata.set(Metadata.LATITUDE, m.group(1));
metadata.set(Metadata.LONGITUDE, m.group(2));
} else {
metadata.set("ICBM", value);
}
} else if (name.equalsIgnoreCase(Metadata.CONTENT_TYPE)) {
//don't overwrite Metadata.CONTENT_TYPE!
MediaType type = MediaType.parse(value);
if (type != null) {
metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, type.toString());
} else {
metadata.set(TikaCoreProperties.CONTENT_TYPE_HINT, value);
}
} else {
metadata.add(name, value);
}
}
private void startElementWithSafeAttributes(String name, Attributes atts) throws SAXException {
if (atts.getLength() == 0) {
xhtml.startElement(name);
return;
}
boolean isObject = name.equals("object");
String codebase = null;
if (isObject) {
codebase = atts.getValue("", "codebase");
if (codebase != null) {
codebase = resolve(codebase);
} else {
codebase = metadata.get(Metadata.CONTENT_LOCATION);
}
}
AttributesImpl newAttributes = new AttributesImpl(atts);
for (int att = 0; att < newAttributes.getLength(); att++) {
String attrName = newAttributes.getLocalName(att);
String normAttrName = mapper.mapSafeAttribute(name, attrName);
if (normAttrName == null) {
newAttributes.removeAttribute(att);
att--;
} else {
// We have a remapped attribute name, so set it as it might have changed.
newAttributes.setLocalName(att, normAttrName);
// And resolve relative links. Eventually this should be pushed
// into the HtmlMapper code.
if (URI_ATTRIBUTES.contains(normAttrName)) {
newAttributes.setValue(att, resolve(newAttributes.getValue(att)));
} else if (isObject && "codebase".equals(normAttrName)) {
newAttributes.setValue(att, codebase);
} else if (isObject
&& ("data".equals(normAttrName)
|| "classid".equals(normAttrName))) {
newAttributes.setValue(
att,
resolve(codebase, newAttributes.getValue(att)));
}
}
}
if ("img".equals(name) && newAttributes.getValue("", "alt") == null) {
newAttributes.addAttribute("", "alt", "alt", "CDATA", "");
}
xhtml.startElement(name, newAttributes);
}
@Override
public void endElement(
String uri, String local, String name) throws SAXException {
if (bodyLevel > 0 && discardLevel == 0) {
String safe = mapper.mapSafeElement(name);
if (safe != null) {
xhtml.endElement(safe);
} else if (XHTMLContentHandler.ENDLINE.contains(
name.toLowerCase(Locale.ENGLISH))) {
// TIKA-343: Replace closing block tags (and ) with a
// newline unless the HtmlMapper above has already mapped
// them to something else
xhtml.newline();
}
}
if (titleLevel > 0) {
titleLevel--;
if (titleLevel == 0 && !isTitleSetToMetadata) {
metadata.set(TikaCoreProperties.TITLE, title.toString().trim());
isTitleSetToMetadata = true;
}
}
if (bodyLevel > 0) {
bodyLevel--;
}
if (discardLevel > 0) {
discardLevel--;
}
}
@Override
public void characters(char[] ch, int start, int length)
throws SAXException {
if (titleLevel > 0 && bodyLevel == 0) {
title.append(ch, start, length);
}
if (bodyLevel > 0 && discardLevel == 0) {
super.characters(ch, start, length);
}
}
@Override
public void ignorableWhitespace(char[] ch, int start, int length)
throws SAXException {
if (bodyLevel > 0 && discardLevel == 0) {
super.ignorableWhitespace(ch, start, length);
}
}
private String resolve(String url) {
return resolve(metadata.get(Metadata.CONTENT_LOCATION), url);
}
private String resolve(String base, String url) {
url = url.trim();
// Return the URL as-is if no base URL is available or if the URL
// matches a common non-hierarchical or pseudo URI prefix
String lower = url.toLowerCase(Locale.ENGLISH);
if (base == null
|| lower.startsWith("urn:")
|| lower.startsWith("mailto:")
|| lower.startsWith("tel:")
|| lower.startsWith("data:")
|| lower.startsWith("javascript:")
|| lower.startsWith("about:")) {
return url;
}
try {
URL baseURL = new URL(base.trim());
// We need to handle one special case, where the relativeUrl is
// just a query string (like "?pid=1"), and the baseUrl doesn't
// end with a '/'. In that case, the URL class removes the last
// portion of the path, which we don't want.
String path = baseURL.getPath();
if (url.startsWith("?") && path.length() > 0 && !path.endsWith("/")) {
return new URL(
baseURL.getProtocol(),
baseURL.getHost(), baseURL.getPort(),
baseURL.getPath() + url).toExternalForm();
} else {
return new URL(baseURL, url).toExternalForm();
}
} catch (MalformedURLException e) {
// Unknown or broken format; just return the URL as received.
return url;
}
}
}