// Note from the download site: serving project downloads consumes considerable server resources,
// so a fee of $1 per project is charged to cover server costs. Thank you in advance for your understanding.
// Once purchased, this project may be downloaded and modified as often as you like.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.mime;
import javax.xml.XMLConstants;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.sax.SAXResult;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.tika.exception.TikaException;
import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * A reader for XML files compliant with the freedesktop MIME-info DTD.
 *
 * <pre>
 *  &lt;!DOCTYPE mime-info [
 *    &lt;!ELEMENT mime-info (mime-type)+&gt;
 *    &lt;!ATTLIST mime-info xmlns CDATA #FIXED "http://www.freedesktop.org/standards/shared-mime-info"&gt;
 *
 *    &lt;!ELEMENT mime-type (comment|acronym|expanded-acronym|glob|magic|root-XML|alias|sub-class-of)*&gt;
 *    &lt;!ATTLIST mime-type type CDATA #REQUIRED&gt;
 *
 *    &lt;!-- a comment describing a document with the respective MIME type. Example: "WMV video" --&gt;
 *    &lt;!ELEMENT _comment (#PCDATA)&gt;
 *    &lt;!ATTLIST _comment xml:lang CDATA #IMPLIED&gt;
 *
 *    &lt;!-- a comment describing the respective unexpanded MIME type acronym. Example: "WMV" --&gt;
 *    &lt;!ELEMENT acronym (#PCDATA)&gt;
 *    &lt;!ATTLIST acronym xml:lang CDATA #IMPLIED&gt;
 *
 *    &lt;!-- a comment describing the respective expanded MIME type acronym. Example: "Windows Media Video" --&gt;
 *    &lt;!ELEMENT expanded-acronym (#PCDATA)&gt;
 *    &lt;!ATTLIST expanded-acronym xml:lang CDATA #IMPLIED&gt;
 *
 *    &lt;!ELEMENT glob EMPTY&gt;
 *    &lt;!ATTLIST glob pattern CDATA #REQUIRED&gt;
 *    &lt;!ATTLIST glob isregex CDATA #IMPLIED&gt;
 *
 *    &lt;!ELEMENT magic (match)+&gt;
 *    &lt;!ATTLIST magic priority CDATA #IMPLIED&gt;
 *
 *    &lt;!ELEMENT match (match)*&gt;
 *    &lt;!ATTLIST match offset CDATA #REQUIRED&gt;
 *    &lt;!ATTLIST match type (string|big16|big32|little16|little32|host16|host32|byte) #REQUIRED&gt;
 *    &lt;!ATTLIST match value CDATA #REQUIRED&gt;
 *    &lt;!ATTLIST match mask CDATA #IMPLIED&gt;
 *
 *    &lt;!ELEMENT root-XML EMPTY&gt;
 *    &lt;!ATTLIST root-XML
 *          namespaceURI CDATA #REQUIRED
 *          localName CDATA #REQUIRED&gt;
 *
 *    &lt;!ELEMENT alias EMPTY&gt;
 *    &lt;!ATTLIST alias
 *          type CDATA #REQUIRED&gt;
 *
 *    &lt;!ELEMENT sub-class-of EMPTY&gt;
 *    &lt;!ATTLIST sub-class-of
 *          type CDATA #REQUIRED&gt;
 *  ]&gt;
 * </pre>
 * <p>
 * In addition to the standard fields, this will also read two Tika specific fields:
 * <ul>
 *   <li>link</li>
 *   <li>uti</li>
 * </ul>
 *
 * @see <a href="http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec">http://freedesktop.org/wiki/Standards_2fshared_2dmime_2dinfo_2dspec</a>
 */
public class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMetKeys {

    /**
     * Parser pool size. Not final because {@link #setPoolSize(int)} updates it.
     */
    private static int POOL_SIZE = 10;

    /**
     * Guards replacement of the parser pool: code that takes parsers from or
     * returns parsers to the pool holds the read lock, while
     * {@link #setPoolSize(int)} holds the write lock when swapping the pool.
     */
    private static final ReentrantReadWriteLock READ_WRITE_LOCK = new ReentrantReadWriteLock();

    /** Pool of reusable SAX parsers; replaced as a whole by {@link #setPoolSize(int)}. */
    private static ArrayBlockingQueue<SAXParser> SAX_PARSERS = new ArrayBlockingQueue<>(POOL_SIZE);

    private static final Logger LOG = Logger.getLogger(MimeTypesReader.class.getName());

    static {
        // Fill the pool eagerly when the class is loaded.
        try {
            setPoolSize(POOL_SIZE);
        } catch (TikaException e) {
            throw new RuntimeException("problem initializing SAXParser pool", e);
        }
    }

    /** Registry that receives the types, aliases, patterns and magic read from the XML. */
    protected final MimeTypes types;

    /** Current type, i.e. the mime-type element being processed; null outside of one. */
    protected MimeType type = null;

    /** Priority of the magic element currently being processed. */
    protected int priority;

    /** Buffer for the text of the current text-bearing element; null when no text is collected. */
    protected StringBuilder characters = null;

    /**
     * Creates a reader that registers everything it reads with the given registry.
     *
     * @param types registry to populate
     */
    protected MimeTypesReader(MimeTypes types) {
        this.types = types;
    }

    /**
     * Parses a MIME-info XML document from the given stream using a pooled
     * SAX parser, with this object as the content handler.
     *
     * @param stream stream to read the XML from
     * @throws IOException       if the parser reports an I/O problem
     * @throws MimeTypeException if no parser could be acquired or the XML could
     *                           not be processed
     */
    public void read(InputStream stream) throws IOException, MimeTypeException {
        SAXParser parser = null;
        try {
            parser = acquireSAXParser();
            parser.parse(stream, this);
        } catch (TikaException e) {
            throw new MimeTypeException("Unable to create an XML parser", e);
        } catch (SAXException e) {
            throw new MimeTypeException("Invalid type configuration", e);
        } finally {
            // Always hand the parser back, even when parsing failed.
            if (parser != null) {
                releaseParser(parser);
            }
        }
    }

    /**
     * Reads MIME-info definitions from an already parsed DOM document by
     * replaying it as SAX events into this handler.
     *
     * @param document document to read
     * @throws MimeTypeException if the transformation fails
     */
    public void read(Document document) throws MimeTypeException {
        try {
            Transformer transformer = XMLReaderUtils.getTransformer();
            transformer.transform(new DOMSource(document), new SAXResult(this));
        } catch (TransformerException | TikaException e) {
            throw new MimeTypeException("Failed to parse type registry", e);
        }
    }

    /**
     * Resolves every entity to an empty source, so that no content is read
     * from the location named by {@code systemId}.
     *
     * @return an input source backed by an empty byte array
     */
    @Override
    public InputSource resolveEntity(String publicId, String systemId) {
        return new InputSource(new ByteArrayInputStream(new byte[0]));
    }

    /**
     * Handles the start of an element. Outside of a mime-type element only the
     * mime-type tag itself is recognised; inside one, the alias, sub-class-of,
     * text-bearing (acronym, comment, link, uti), glob, root-XML, match and
     * magic tags are processed.
     *
     * @throws SAXException if {@link #handleMimeError} or {@link #handleGlobError}
     *                      reports a failure
     */
    @Override
    public void startElement(
            String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        if (type == null) {
            if (MIME_TYPE_TAG.equals(qName)) {
                String name = attributes.getValue(MIME_TYPE_TYPE_ATTR);
                String interpretedAttr = attributes.getValue(INTERPRETED_ATTR);
                boolean interpreted = "true".equals(interpretedAttr);
                try {
                    type = types.forName(name);
                    type.setInterpreted(interpreted);
                } catch (MimeTypeException e) {
                    handleMimeError(name, e, qName, attributes);
                }
            }
        } else if (ALIAS_TAG.equals(qName)) {
            String alias = attributes.getValue(ALIAS_TYPE_ATTR);
            types.addAlias(type, MediaType.parse(alias));
        } else if (SUB_CLASS_OF_TAG.equals(qName)) {
            String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR);
            types.setSuperType(type, MediaType.parse(parent));
        } else if (ACRONYM_TAG.equals(qName)
                || COMMENT_TAG.equals(qName)
                || TIKA_LINK_TAG.equals(qName)
                || TIKA_UTI_TAG.equals(qName)) {
            // Start collecting the element text; it is consumed in endElement.
            characters = new StringBuilder();
        } else if (GLOB_TAG.equals(qName)) {
            String pattern = attributes.getValue(PATTERN_ATTR);
            String isRegex = attributes.getValue(ISREGEX_ATTR);
            if (pattern != null) {
                try {
                    types.addPattern(type, pattern, Boolean.valueOf(isRegex));
                } catch (MimeTypeException e) {
                    handleGlobError(type, pattern, e, qName, attributes);
                }
            }
        } else if (ROOT_XML_TAG.equals(qName)) {
            String namespace = attributes.getValue(NS_URI_ATTR);
            String name = attributes.getValue(LOCAL_NAME_ATTR);
            type.addRootXML(namespace, name);
        } else if (MATCH_TAG.equals(qName)) {
            if (attributes.getValue(MATCH_MINSHOULDMATCH_ATTR) != null) {
                // Placeholder clause; turned into a MinShouldMatchClause once
                // the nested matches are known (see ClauseRecord.stop()).
                current = new ClauseRecord(
                        new MinShouldMatchVal(
                                Integer.parseInt(
                                        attributes.getValue(MATCH_MINSHOULDMATCH_ATTR))));
            } else {
                String kind = attributes.getValue(MATCH_TYPE_ATTR);
                String offset = attributes.getValue(MATCH_OFFSET_ATTR);
                String value = attributes.getValue(MATCH_VALUE_ATTR);
                String mask = attributes.getValue(MATCH_MASK_ATTR);
                if (kind == null) {
                    // A match without an explicit type is treated as a string match.
                    kind = "string";
                }
                current = new ClauseRecord(
                        new MagicMatch(type.getType(), kind, offset, value, mask));
            }
        } else if (MAGIC_TAG.equals(qName)) {
            String value = attributes.getValue(MAGIC_PRIORITY_ATTR);
            if (value != null && value.length() > 0) {
                priority = Integer.parseInt(value);
            } else {
                // Priority used when the attribute is missing or empty.
                priority = 50;
            }
            // Root record that collects the top-level matches of this magic element.
            current = new ClauseRecord(null);
        }
    }

    /**
     * Handles the end of an element inside a mime-type element: stores the
     * collected text on the current type, closes match clauses, and registers
     * the collected clauses as magic when a magic element ends.
     *
     * @throws IllegalArgumentException if the text of a link element is not a valid URI
     */
    @Override
    public void endElement(String uri, String localName, String qName) {
        if (type != null) {
            if (MIME_TYPE_TAG.equals(qName)) {
                type = null;
            } else if (COMMENT_TAG.equals(qName)) {
                type.setDescription(characters.toString().trim());
                characters = null;
            } else if (ACRONYM_TAG.equals(qName)) {
                type.setAcronym(characters.toString().trim());
                characters = null;
            } else if (TIKA_UTI_TAG.equals(qName)) {
                type.setUniformTypeIdentifier(characters.toString().trim());
                characters = null;
            } else if (TIKA_LINK_TAG.equals(qName)) {
                try {
                    type.addLink(new URI(characters.toString().trim()));
                } catch (URISyntaxException e) {
                    throw new IllegalArgumentException("unable to parse link: " + characters, e);
                }
                characters = null;
            } else if (MATCH_TAG.equals(qName)) {
                current.stop();
            } else if (MAGIC_TAG.equals(qName)) {
                // getClauses() is never null, so a magic element without any
                // match children simply registers nothing.
                for (Clause clause : current.getClauses()) {
                    type.addMagic(new Magic(type, priority, clause));
                }
                current = null;
            }
        }
    }

    /**
     * Appends character data to the text buffer when a text-bearing element
     * is currently open; ignores it otherwise.
     */
    @Override
    public void characters(char[] ch, int start, int length) {
        if (characters != null) {
            characters.append(ch, start, length);
        }
    }

    /**
     * Called when the registry rejects the name of a mime-type element.
     * This implementation aborts parsing; subclasses may override it.
     *
     * @param input      the type name that was rejected
     * @param ex         the failure reported by the registry
     * @param qName      qualified name of the element being processed
     * @param attributes attributes of the element being processed
     * @throws SAXException always, wrapping {@code ex}
     */
    protected void handleMimeError(String input, MimeTypeException ex, String qName, Attributes attributes) throws SAXException {
        throw new SAXException(ex);
    }

    /**
     * Called when the registry rejects a glob pattern.
     * This implementation aborts parsing; subclasses may override it.
     *
     * @param type       the type the pattern was being added to
     * @param pattern    the pattern that was rejected
     * @param ex         the failure reported by the registry
     * @param qName      qualified name of the element being processed
     * @param attributes attributes of the element being processed
     * @throws SAXException always, wrapping {@code ex}
     */
    protected void handleGlobError(MimeType type, String pattern, MimeTypeException ex, String qName, Attributes attributes) throws SAXException {
        throw new SAXException(ex);
    }

    /** Record of the magic/match element currently being built. */
    private ClauseRecord current = new ClauseRecord(null);

    /**
     * Node in the stack of open magic/match elements. Each record remembers
     * the record that was current when it was created and collects the
     * clauses of its nested match elements.
     */
    private class ClauseRecord {

        private final ClauseRecord parent;

        private Clause clause;

        private List<Clause> subclauses = null;

        public ClauseRecord(Clause clause) {
            this.parent = current;
            this.clause = clause;
        }

        /**
         * Finalises this record's clause, appends it to the parent record and
         * makes the parent the current record again.
         */
        public void stop() {
            if (clause instanceof MinShouldMatchVal) {
                clause = new MinShouldMatchClause(((MinShouldMatchVal) clause).getVal(), subclauses);
            } else if (subclauses != null) {
                // This match AND (any one of its nested matches).
                Clause subclause;
                if (subclauses.size() == 1) {
                    subclause = subclauses.get(0);
                } else {
                    subclause = new OrClause(subclauses);
                }
                clause = new AndClause(clause, subclause);
            }
            if (parent.subclauses == null) {
                parent.subclauses = Collections.singletonList(clause);
            } else {
                if (parent.subclauses.size() == 1) {
                    // A single entry means the immutable singleton list is
                    // still in place; switch to a growable list before adding.
                    parent.subclauses = new ArrayList<>(parent.subclauses);
                }
                parent.subclauses.add(clause);
            }
            current = current.parent;
        }

        /**
         * @return the clauses collected from nested match elements; an empty
         *         list (never null) when there were none
         */
        public List<Clause> getClauses() {
            if (subclauses == null) {
                return Collections.emptyList();
            }
            return subclauses;
        }
    }

    /**
     * Shim class used during building of actual classes.
     * This temporarily holds the value of the minShouldMatchClause
     * so that the actual MinShouldMatchClause can have a cleaner/immutable
     * initialization.
     */
    private static class MinShouldMatchVal implements Clause {

        private final int val;

        MinShouldMatchVal(int val) {
            this.val = val;
        }

        int getVal() {
            return val;
        }

        @Override
        public boolean eval(byte[] data) {
            throw new IllegalStateException("This should never be used " +
                    "on this placeholder class");
        }

        @Override
        public int size() {
            return 0;
        }
    }

    /**
     * Acquire a SAXParser from the pool, waiting until one becomes available.
     * Make sure to {@link #releaseParser(SAXParser)} in
     * a finally block every time you call this.
     *
     * @return a SAXParser
     * @throws TikaException if the thread is interrupted while waiting; the
     *                       thread's interrupt status is restored before throwing
     */
    private static SAXParser acquireSAXParser()
            throws TikaException {
        while (true) {
            SAXParser parser = null;
            // Lock before entering try so that unlock only runs once the lock is held.
            READ_WRITE_LOCK.readLock().lock();
            try {
                // Short timeout so the read lock is released regularly and a
                // pending setPoolSize() can obtain the write lock.
                parser = SAX_PARSERS.poll(10, TimeUnit.MILLISECONDS);
            } catch (InterruptedException e) {
                // Keep the interrupt visible to callers further up the stack.
                Thread.currentThread().interrupt();
                throw new TikaException("interrupted while waiting for SAXParser", e);
            } finally {
                READ_WRITE_LOCK.readLock().unlock();
            }
            if (parser != null) {
                return parser;
            }
        }
    }

    /**
     * Return parser to the pool for reuse
     *
     * @param parser parser to return
     */
    private static void releaseParser(SAXParser parser) {
        try {
            parser.reset();
        } catch (UnsupportedOperationException e) {
            //ignore: a parser that cannot be reset is returned to the pool as it is
        }
        READ_WRITE_LOCK.readLock().lock();
        try {
            //if there are extra parsers (e.g. after a reset of the pool to a smaller size),
            // this parser will not be added and will then be gc'd
            SAX_PARSERS.offer(parser);
        } finally {
            READ_WRITE_LOCK.readLock().unlock();
        }
    }

    /**
     * Set the pool size for cached XML parsers. The existing pool is replaced
     * by a freshly filled one; if creating the new parsers fails, the existing
     * pool stays in place.
     *
     * @param poolSize number of parsers to keep in the pool; must be at least 1
     * @throws TikaException            if a parser cannot be created
     * @throws IllegalArgumentException if {@code poolSize} is less than 1
     */
    public static void setPoolSize(int poolSize) throws TikaException {
        // Build the replacement pool before touching shared state, so that a
        // failure here cannot leave a partially filled pool installed.
        ArrayBlockingQueue<SAXParser> parsers = new ArrayBlockingQueue<>(poolSize);
        for (int i = 0; i < poolSize; i++) {
            parsers.offer(newSAXParser());
        }
        //stop the world with a write lock
        //parsers that are currently in use will be offered, but not
        //accepted and will be gc'd
        READ_WRITE_LOCK.writeLock().lock();
        try {
            SAX_PARSERS = parsers;
            POOL_SIZE = poolSize;
        } finally {
            READ_WRITE_LOCK.writeLock().unlock();
        }
    }

    /**
     * Creates a new, non-namespace-aware SAX parser, requesting the secure
     * processing feature from the factory. If the factory rejects that
     * feature, a warning is logged and the parser is created regardless.
     *
     * @return a new SAXParser
     * @throws TikaException if the factory cannot create a parser
     */
    private static SAXParser newSAXParser() throws TikaException {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setNamespaceAware(false);
        try {
            factory.setFeature(
                    XMLConstants.FEATURE_SECURE_PROCESSING, true);
        } catch (ParserConfigurationException | SAXException e) {
            LOG.log(Level.WARNING,
                    "can't set secure parsing feature on SAXParserFactory: " +
                            factory.getClass() + ". User assumes responsibility for consequences.");
        }
        try {
            return factory.newSAXParser();
        } catch (ParserConfigurationException | SAXException e) {
            throw new TikaException("can't create saxparser", e);
        }
    }
}