// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
// You can buy this project and download/modify it how often you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.mime;
import javax.xml.XMLConstants;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.sax.SAXResult;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.w3c.dom.Document;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* A reader for XML files compliant with the freedesktop MIME-info DTD.
*
* <pre>{@code
* <!DOCTYPE mime-info [
* <!ELEMENT mime-info (mime-type)+>
* <!ATTLIST mime-info xmlns CDATA #FIXED "http://www.freedesktop.org/standards/shared-mime-info">
*
* <!ELEMENT mime-type (comment|acronym|expanded-acronym|glob|magic|root-XML|alias|sub-class-of)*>
* <!ATTLIST mime-type type CDATA #REQUIRED>
*
* <!-- a comment describing a document with the respective MIME type. Example: "WMV video" -->
* <!ELEMENT _comment (#PCDATA)>
* <!ATTLIST _comment xml:lang CDATA #IMPLIED>
*
* <!-- the unexpanded acronym of the respective MIME type. Example: "WMV" -->
* <!ELEMENT acronym (#PCDATA)>
* <!ATTLIST acronym xml:lang CDATA #IMPLIED>
*
* <!-- the expanded form of the respective MIME type acronym. Example: "Windows Media Video" -->
* <!ELEMENT expanded-acronym (#PCDATA)>
* <!ATTLIST expanded-acronym xml:lang CDATA #IMPLIED>
*
* <!ELEMENT glob EMPTY>
* <!ATTLIST glob pattern CDATA #REQUIRED>
* <!ATTLIST glob isregex CDATA #IMPLIED>
*
* <!ELEMENT magic (match)+>
* <!ATTLIST magic priority CDATA #IMPLIED>
*
* <!ELEMENT match (match)*>
* <!ATTLIST match offset CDATA #REQUIRED>
* <!ATTLIST match type (string|big16|big32|little16|little32|host16|host32|byte) #REQUIRED>
* <!ATTLIST match value CDATA #REQUIRED>
* <!ATTLIST match mask CDATA #IMPLIED>
*
* <!ELEMENT root-XML EMPTY>
* <!ATTLIST root-XML
* namespaceURI CDATA #REQUIRED
* localName CDATA #REQUIRED>
*
* <!ELEMENT alias EMPTY>
* <!ATTLIST alias
* type CDATA #REQUIRED>
*
* <!ELEMENT sub-class-of EMPTY>
* <!ATTLIST sub-class-of
* type CDATA #REQUIRED>
* ]>
* }</pre>
*
* In addition to the standard fields, this will also read two Tika specific fields:
* - link
* - uti
*
*
* @see <a href="http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec">freedesktop.org shared MIME-info specification</a>
*/
public class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMetKeys {

    /** Registry that receives the types, aliases, super types and patterns read from the XML. */
    protected final MimeTypes types;

    /** Current type */
    protected MimeType type = null;

    /** Priority of the {@code magic} element most recently opened (50 when not specified). */
    protected int priority;

    /** Collects element text while a text-bearing element is open; {@code null} otherwise. */
    protected StringBuilder characters = null;

    /**
     * Record for the innermost open {@code magic}/{@code match} element.
     * The initial record has no parent because {@code current} is still
     * {@code null} while this initializer runs.
     */
    private ClauseRecord current = new ClauseRecord(null);

    /**
     * Creates a reader that registers everything it parses in the given registry.
     *
     * @param types registry to populate
     */
    protected MimeTypesReader(MimeTypes types) {
        this.types = types;
    }

    /**
     * Parses a MIME-info XML stream with a non-namespace-aware SAX parser that
     * has secure processing enabled, using this object as the handler.
     *
     * @param stream XML input
     * @throws IOException if the parser reports an I/O failure
     * @throws MimeTypeException if the parser cannot be created or the XML is rejected
     */
    public void read(InputStream stream) throws IOException, MimeTypeException {
        try {
            SAXParserFactory factory = SAXParserFactory.newInstance();
            factory.setNamespaceAware(false);
            factory.setFeature(
                    XMLConstants.FEATURE_SECURE_PROCESSING, true);
            SAXParser parser = factory.newSAXParser();
            parser.parse(stream, this);
        } catch (ParserConfigurationException e) {
            throw new MimeTypeException("Unable to create an XML parser", e);
        } catch (SAXException e) {
            throw new MimeTypeException("Invalid type configuration", e);
        }
    }

    /**
     * Reads an already parsed MIME-info document by transforming the DOM tree
     * into SAX events that are delivered to this handler.
     *
     * @param document MIME-info document
     * @throws MimeTypeException if the transformation fails
     */
    public void read(Document document) throws MimeTypeException {
        try {
            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer transformer = factory.newTransformer();
            transformer.transform(new DOMSource(document), new SAXResult(this));
        } catch (TransformerException e) {
            throw new MimeTypeException("Failed to parse type registry", e);
        }
    }

    /**
     * Resolves every entity to an empty input source, so nothing referenced by
     * {@code publicId}/{@code systemId} is ever loaded.
     */
    @Override
    public InputSource resolveEntity(String publicId, String systemId) {
        return new InputSource(new ByteArrayInputStream(new byte[0]));
    }

    /**
     * Handles the opening tag of an element. Outside of a {@code mime-type}
     * element only the {@code mime-type} tag itself is recognised; inside one,
     * aliases, super types, globs, root-XML entries and magic/match clauses are
     * registered, and text-bearing elements start a fresh text buffer.
     *
     * @throws SAXException if {@link #handleMimeError} or {@link #handleGlobError} raises one
     */
    @Override
    public void startElement(
            String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        if (type == null) {
            if (MIME_TYPE_TAG.equals(qName)) {
                String name = attributes.getValue(MIME_TYPE_TYPE_ATTR);
                try {
                    type = types.forName(name);
                } catch (MimeTypeException e) {
                    handleMimeError(name, e, qName, attributes);
                }
            }
        } else if (ALIAS_TAG.equals(qName)) {
            String alias = attributes.getValue(ALIAS_TYPE_ATTR);
            types.addAlias(type, MediaType.parse(alias));
        } else if (SUB_CLASS_OF_TAG.equals(qName)) {
            String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR);
            types.setSuperType(type, MediaType.parse(parent));
        } else if (ACRONYM_TAG.equals(qName)
                || COMMENT_TAG.equals(qName)
                || TIKA_LINK_TAG.equals(qName)
                || TIKA_UTI_TAG.equals(qName)) {
            // Text content is consumed in endElement.
            characters = new StringBuilder();
        } else if (GLOB_TAG.equals(qName)) {
            String pattern = attributes.getValue(PATTERN_ATTR);
            String isRegex = attributes.getValue(ISREGEX_ATTR);
            if (pattern != null) {
                try {
                    // parseBoolean(null) is false, so a missing attribute means "not a regex".
                    types.addPattern(type, pattern, Boolean.parseBoolean(isRegex));
                } catch (MimeTypeException e) {
                    handleGlobError(type, pattern, e, qName, attributes);
                }
            }
        } else if (ROOT_XML_TAG.equals(qName)) {
            String namespace = attributes.getValue(NS_URI_ATTR);
            String name = attributes.getValue(LOCAL_NAME_ATTR);
            type.addRootXML(namespace, name);
        } else if (MATCH_TAG.equals(qName)) {
            String kind = attributes.getValue(MATCH_TYPE_ATTR);
            String offset = attributes.getValue(MATCH_OFFSET_ATTR);
            String value = attributes.getValue(MATCH_VALUE_ATTR);
            String mask = attributes.getValue(MATCH_MASK_ATTR);
            if (kind == null) {
                kind = "string";
            }
            // The new record becomes a child of whichever magic/match is currently open.
            current = new ClauseRecord(
                    new MagicMatch(type.getType(), kind, offset, value, mask));
        } else if (MAGIC_TAG.equals(qName)) {
            String value = attributes.getValue(MAGIC_PRIORITY_ATTR);
            if (value != null && value.length() > 0) {
                priority = Integer.parseInt(value);
            } else {
                priority = 50;
            }
            current = new ClauseRecord(null);
        }
    }

    /**
     * Handles the closing tag of an element inside a {@code mime-type}: stores
     * the collected text on the current type, folds a finished {@code match}
     * into its parent, or turns the clauses of a finished {@code magic} into
     * {@link Magic} entries on the current type.
     *
     * @throws IllegalArgumentException if the text of a link element is not a valid URI
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        if (type != null) {
            if (MIME_TYPE_TAG.equals(qName)) {
                type = null;
            } else if (COMMENT_TAG.equals(qName)) {
                type.setDescription(characters.toString().trim());
                characters = null;
            } else if (ACRONYM_TAG.equals(qName)) {
                type.setAcronym(characters.toString().trim());
                characters = null;
            } else if (TIKA_UTI_TAG.equals(qName)) {
                type.setUniformTypeIdentifier(characters.toString().trim());
                characters = null;
            } else if (TIKA_LINK_TAG.equals(qName)) {
                try {
                    type.addLink(new URI(characters.toString().trim()));
                } catch (URISyntaxException e) {
                    throw new IllegalArgumentException("unable to parse link: " + characters, e);
                }
                characters = null;
            } else if (MATCH_TAG.equals(qName)) {
                current.stop();
            } else if (MAGIC_TAG.equals(qName)) {
                // getClauses() is empty (never null) for a magic element without matches.
                for (Clause clause : current.getClauses()) {
                    type.addMagic(new Magic(type, priority, clause));
                }
                current = null;
            }
        }
    }

    /** Appends character data to the text buffer, if one is currently open. */
    @Override
    public void characters(char[] ch, int start, int length) {
        if (characters != null) {
            characters.append(ch, start, length);
        }
    }

    /**
     * Called when the registry rejects the name of a {@code mime-type} element.
     * This implementation aborts parsing by wrapping the failure in a
     * {@link SAXException}; subclasses may override it.
     *
     * @param input the rejected type name
     * @param ex the failure reported by the registry
     * @param qName qualified name of the element being processed
     * @param attributes attributes of that element
     * @throws SAXException always, wrapping {@code ex}
     */
    protected void handleMimeError(String input, MimeTypeException ex, String qName, Attributes attributes) throws SAXException {
        throw new SAXException(ex);
    }

    /**
     * Called when the registry rejects a glob pattern. This implementation
     * aborts parsing by wrapping the failure in a {@link SAXException};
     * subclasses may override it.
     *
     * @param type the type the pattern was being added to
     * @param pattern the rejected pattern
     * @param ex the failure reported by the registry
     * @param qName qualified name of the element being processed
     * @param attributes attributes of that element
     * @throws SAXException always, wrapping {@code ex}
     */
    protected void handleGlobError(MimeType type, String pattern, MimeTypeException ex, String qName, Attributes attributes) throws SAXException {
        throw new SAXException(ex);
    }

    /**
     * One open {@code magic} or {@code match} element together with the
     * clauses contributed by its already closed children. It is an inner
     * (non-static) class because it reads and updates the reader's
     * {@code current} pointer.
     */
    private class ClauseRecord {

        /** Record that was current when this one was created; may be {@code null}. */
        private final ClauseRecord parent;

        /** Clause of this element; {@code null} for a {@code magic} record. */
        private Clause clause;

        /** Clauses of closed child matches, or {@code null} while there are none. */
        private List<Clause> subclauses = null;

        public ClauseRecord(Clause clause) {
            this.parent = current;
            this.clause = clause;
        }

        /**
         * Closes this record: combines its own clause with the alternatives
         * supplied by its children (own AND (child1 OR child2 ...)), appends
         * the result to the parent's children and makes the parent current.
         */
        public void stop() {
            if (subclauses != null) {
                Clause subclause;
                if (subclauses.size() == 1) {
                    subclause = subclauses.get(0);
                } else {
                    subclause = new OrClause(subclauses);
                }
                clause = new AndClause(clause, subclause);
            }
            if (parent.subclauses == null) {
                parent.subclauses = Collections.singletonList(clause);
            } else {
                if (parent.subclauses.size() == 1) {
                    // The singleton list is immutable; switch to a growable copy.
                    parent.subclauses = new ArrayList<Clause>(parent.subclauses);
                }
                parent.subclauses.add(clause);
            }
            current = current.parent;
        }

        /**
         * Returns the clauses contributed by closed children.
         *
         * @return the child clauses, or an empty list when there are none
         */
        public List<Clause> getClauses() {
            if (subclauses == null) {
                return Collections.<Clause>emptyList();
            }
            return subclauses;
        }
    }
}