org.apache.tika.parser.prt.PRTParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aem-sdk-api Show documentation
Show all versions of aem-sdk-api Show documentation
The Adobe Experience Manager SDK
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.prt;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.Set;
import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import static java.nio.charset.StandardCharsets.US_ASCII;
/**
* A basic text extracting parser for the CADKey PRT (CAD Drawing)
* format. It outputs text from note entries.
*/
public class PRTParser extends AbstractParser {
/** Serial version UID */
private static final long serialVersionUID = 4659638314375035178L;
/**
 * The media types this parser handles: a single-element, immutable set
 * built from {@code MediaType.application("x-prt")}.
 */
private static final Set<MediaType> SUPPORTED_TYPES =
        Collections.singleton(MediaType.application("x-prt"));
/** MIME type string written to the content-type metadata by {@code parse}. */
public static final String PRT_MIME_TYPE = "application/x-prt";

/**
 * Reports the media types this parser can handle.
 *
 * @param context the parse context; not consulted by this implementation
 * @return the fixed, shared set of supported media types
 */
public Set<MediaType> getSupportedTypes(ParseContext context) {
    return SUPPORTED_TYPES;
}
/**
 * Upper bound on the length a text run may declare for itself. A larger
 * declared length is taken to mean the bytes were misidentified and are
 * not really text, so the run is ignored.
 */
private static final int MAX_SANE_TEXT_LENGTH = 2 * 1024; // 0x0800
/*
* Text types:
* 00 00 00 00 f0 [3b]f sz sz TEXT *view name*
* 00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT *view name*
* (anything) e0 3f sz sz TEXT *view name*
* 3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries*
*
* Note - all text is null terminated
*/
/**
 * Reads a CADKey PRT stream, filling in metadata and emitting every text
 * run it recognises as a {@code <p>} element.
 *
 * <p>The stream is consumed in order: a 30 byte header (ignored), a 12 byte
 * ASCII creation date ({@code YYYYMMDDhhmm}), a 500 byte description area,
 * and then the remainder, which is scanned byte by byte for the marker
 * pairs that precede note text and view names.</p>
 *
 * <p>NOTE(review): this method never calls {@code startDocument()} or
 * {@code endDocument()} on the XHTML handler it creates - confirm whether
 * that is intentional.</p>
 *
 * @param stream   the PRT data to read
 * @param handler  receives the XHTML SAX events
 * @param metadata receives the creation date, content type and description
 * @param context  the parse context; not consulted here
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if a helper fails while decoding the data
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
        ParseContext context) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    Last5 l5 = new Last5();

    // Step over the 30 byte header; the creation date comes straight after it
    IOUtils.readFully(stream, new byte[30]);
    byte[] rawDate = new byte[12];
    IOUtils.readFully(stream, rawDate);

    // Only accept the date when it plausibly starts with a 19xx / 20xx year
    String stamp = new String(rawDate, US_ASCII);
    if (stamp.startsWith("19") || stamp.startsWith("20")) {
        // YYYYMMDDhhmm -> YYYY-MM-DDThh:mm:00
        String isoDate = new StringBuilder(19)
                .append(stamp, 0, 4).append("-")
                .append(stamp, 4, 6).append("-")
                .append(stamp, 6, 8).append("T")
                .append(stamp, 8, 10).append(":")
                .append(stamp, 10, 12).append(":00")
                .toString();
        metadata.set(TikaCoreProperties.CREATED, isoDate);
        // TODO Metadata.DATE is used as modified, should it be here?
        metadata.set(Metadata.DATE, isoDate);
    }
    metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);

    // The optional description occupies the following (up to) 500 bytes
    byte[] rawDescription = new byte[500];
    IOUtils.readFully(stream, rawDescription);
    String description = extractText(rawDescription, true);
    if (!description.isEmpty()) {
        metadata.set(TikaCoreProperties.DESCRIPTION, description);
    }

    // Scan what is left for the two-byte markers that introduce text
    int current;
    while ((current = stream.read()) >= 0) {
        boolean firstMarkerByte = current == 0xe0 || current == 0xe3 || current == 0xf0;
        if (!firstMarkerByte) {
            // Ordinary byte: remember it so the marker checks can look back
            l5.record(current);
            continue;
        }

        int next = stream.read();
        if (next != 0x3f && next != 0xbf) {
            // Second byte does not complete a marker
            continue;
        }

        if (current == 0xe3 && next == 0x3f) {
            // Note text marker; only trusted when preceded by the 0x33 run
            if (l5.is33()) {
                handleNoteText(stream, xhtml);
            }
        } else if (l5.is00()) {
            // Preceded by zeros, so most likely a view name
            handleViewName(current, next, stream, xhtml, l5);
        }
    }
}
/**
 * Tries to read a note text entry that starts at the current stream position,
 * i.e. immediately after the {@code e3 3f} marker.
 *
 * <p>The entry is only accepted when the next ten bytes each lie in
 * {@code 0x00..0x0f}, the byte after them is {@code 0x1f}, and the following
 * little-endian unsigned short is a length between 1 and
 * {@link #MAX_SANE_TEXT_LENGTH}. Anything else is treated as a false
 * detection and the method returns without emitting text; bytes already
 * consumed are not pushed back.</p>
 *
 * @param stream the stream, positioned just after the marker
 * @param xhtml  the handler that receives the text
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if the length or the text cannot be read
 */
private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    // Ensure we have the right padding bytes
    for (int i = 0; i < 10; i++) {
        int pad = stream.read();
        if (pad < 0 || pad > 0x0f) {
            // Wrong (or end of stream), false detection
            return;
        }
    }

    if (stream.read() != 0x1f) {
        // Wrong, false detection
        return;
    }

    int length = EndianUtils.readUShortLE(stream);
    // A zero length has no terminator byte for handleText to inspect and
    // previously caused an ArrayIndexOutOfBoundsException there, so it is
    // rejected here together with over-long values.
    if (length > 0 && length <= MAX_SANE_TEXT_LENGTH) {
        handleText(length, stream, xhtml);
    }
}
/**
 * Tries to read a view name that starts at the current stream position,
 * i.e. immediately after a two-byte marker.
 *
 * <p>Two layouts are recognised. If the next little-endian unsigned short is
 * non-zero it is taken as the text length directly. If it is zero, six more
 * bytes in {@code 0x00..0x0f} are expected, followed by a two-byte length.
 * When that second length is rejected, its two bytes are recorded in
 * {@code l5} instead.</p>
 *
 * @param typeA  first marker byte; not used by this method
 * @param typeB  second marker byte; not used by this method
 * @param stream the stream, positioned just after the marker
 * @param xhtml  the handler that receives the text
 * @param l5     history of recently seen bytes, updated on a rejected length
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if a length or the text cannot be read
 */
private void handleViewName(int typeA, int typeB, InputStream stream,
        XHTMLContentHandler xhtml, Last5 l5)
        throws IOException, SAXException, TikaException {
    int firstWord = EndianUtils.readUShortLE(stream);

    if (firstWord != 0) {
        // No zero padding: the word itself is the length, text follows directly
        if (firstWord > 0 && firstWord < MAX_SANE_TEXT_LENGTH) {
            handleText(firstWord, stream, xhtml);
        }
        return;
    }

    // Zero word: possibly the 8 byte zero padded form, so check 6 more bytes
    for (int remaining = 6; remaining > 0; remaining--) {
        int pad = stream.read();
        if (pad < 0 || pad > 0x0f) {
            // Wrong, false detection
            return;
        }
    }

    byte[] sizeBytes = new byte[2];
    IOUtils.readFully(stream, sizeBytes);
    int length = EndianUtils.getUShortLE(sizeBytes);

    boolean plausible = length > 1 && length <= MAX_SANE_TEXT_LENGTH;
    if (plausible) {
        handleText(length, stream, xhtml);
        return;
    }

    // Was probably something else, keep the bytes in the look-back history
    l5.record(sizeBytes[0]);
    l5.record(sizeBytes[1]);
}
/**
 * Reads {@code length} bytes from the stream and, if they end in a null
 * terminator, emits them as the text of a {@code <p>} element.
 *
 * <p>A run whose last byte is not zero is silently dropped, as is a
 * non-positive {@code length} (nothing is read from the stream in that
 * case).</p>
 *
 * @param length number of bytes the text run claims to occupy, terminator included
 * @param stream the stream, positioned at the first byte of the text
 * @param xhtml  the handler that receives the paragraph
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if the bytes cannot be turned into text
 */
private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml)
        throws IOException, SAXException, TikaException {
    if (length <= 0) {
        // An empty run has no terminator byte; indexing str[length - 1] below
        // would throw ArrayIndexOutOfBoundsException, so treat it as no text.
        return;
    }

    byte[] str = new byte[length];
    IOUtils.readFully(stream, str);
    if (str[length - 1] != 0) {
        // Not properly null terminated, must be wrong
        return;
    }

    String text = extractText(str, false);
    xhtml.startElement("p");
    xhtml.characters(text);
    xhtml.endElement("p");
}
/**
* Does our best to turn the bytes into text
*/
private String extractText(byte[] data, boolean trim) throws TikaException {
// The text is always stored null terminated, but sometimes
// may have extra null padding too
int length = data.length - 1;
if(trim) {
for(int i=0; i= data.length) {
pos = 0;
}
}
private byte[] get() {
byte[] ret = new byte[5];
for(int i=0; i