// org.apache.tika.parser.prt.PRTParser Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.prt;
import static java.nio.charset.StandardCharsets.US_ASCII;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
/**
* A basic text extracting parser for the CADKey PRT (CAD Drawing)
* format. It outputs text from note entries.
*/
public class PRTParser implements Parser {
    public static final String PRT_MIME_TYPE = "application/x-prt";
    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 4659638314375035178L;
    /**
     * The single media type this parser claims to handle.
     */
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.application("x-prt"));
    /**
     * How long do we allow a text run to claim to be, before we
     * decide we're confused and it's not really text after all?
     */
    private static final int MAX_TEXT_LENGTH = 0x0800;

    /**
     * Returns the media types supported by this parser.
     *
     * @param context the parse context (not consulted)
     * @return a singleton set containing {@code application/x-prt}
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /*
     * Text types:
     * 00 00 00 00 f0 [3b]f sz sz TEXT *view name*
     * 00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT *view name*
     * (anything) e0 3f sz sz TEXT *view name*
     * 3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries*
     *
     * Note - all text is null terminated
     */

    /**
     * Reads the fixed-size leading fields (30 skipped bytes, a 12 byte
     * creation date, a 500 byte description) and then scans the remainder
     * of the stream byte by byte for the marker sequences listed above,
     * emitting each text run found as a {@code <p>} element.
     *
     * @param stream   the PRT data to read
     * @param handler  receives the extracted text as XHTML SAX events
     * @param metadata receives the content type and, when present, the
     *                 creation date and description
     * @param context  the parse context (not consulted)
     * @throws IOException   if the stream cannot be read, or ends before the
     *                       fixed-size leading fields have been read
     * @throws SAXException  if the handler rejects an event
     * @throws TikaException if the text cannot be decoded
     */
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        Last5 l5 = new Last5();
        int read;

        // Try to get the creation date, which is YYYYMMDDhhmm
        // The 30 bytes before it are read only to move past them
        byte[] header = new byte[30];
        IOUtils.readFully(stream, header);
        byte[] date = new byte[12];
        IOUtils.readFully(stream, date);

        String dateStr = new String(date, US_ASCII);
        if (dateStr.startsWith("19") || dateStr.startsWith("20")) {
            String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4, 6) + "-" +
                    dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) + ":" +
                    dateStr.substring(10, 12) + ":00";
            // TODO Metadata.DATE is used as modified, should it be here?
            metadata.set(TikaCoreProperties.CREATED, formattedDate);
        }
        metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);

        // The description, if set, is the next up-to-500 bytes
        byte[] desc = new byte[500];
        IOUtils.readFully(stream, desc);
        String description = extractText(desc, true);
        if (description.length() > 0) {
            metadata.set(TikaCoreProperties.DESCRIPTION, description);
        }

        // Now look for text
        while ((read = stream.read()) > -1) {
            if (read == 0xe0 || read == 0xe3 || read == 0xf0) {
                int nread = stream.read();
                if (nread == 0x3f || nread == 0xbf) {
                    // Looks promising, check back for a suitable value
                    if (read == 0xe3 && nread == 0x3f) {
                        if (l5.is33()) {
                            // Bingo, note text
                            handleNoteText(stream, xhtml);
                        }
                    } else if (l5.is00()) {
                        // Likely view name
                        handleViewName(read, nread, stream, xhtml, l5);
                    }
                }
            } else {
                // Only non-marker bytes feed the look-behind window
                l5.record(read);
            }
        }
    }

    /**
     * Handles a candidate note entry, positioned just after its
     * {@code e3 3f} marker. Expects ten bytes each in the range 0x00-0x0f,
     * then 0x1f, then a little-endian 16 bit length, then the text.
     * Returns without output if the padding does not match.
     *
     * @param stream the stream, positioned after the marker
     * @param xhtml  where to write any text found
     */
    private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
            throws IOException, SAXException, TikaException {
        // Ensure we have the right padding text
        for (int i = 0; i < 10; i++) {
            int padding = stream.read();
            if (padding < 0 || padding > 0x0f) {
                // Wrong, false detection
                return;
            }
        }
        if (stream.read() != 0x1f) {
            // Wrong, false detection
            return;
        }

        int length = EndianUtils.readUShortLE(stream);
        if (length <= MAX_TEXT_LENGTH) {
            // Length check passed (a zero length is rejected by handleText)
            handleText(length, stream, xhtml);
        }
    }

    /**
     * Handles a candidate view name, positioned just after its two marker
     * bytes. The length either follows immediately, or follows 8 bytes of
     * padding (two zero bytes and then six bytes each in the range
     * 0x00-0x0f).
     *
     * @param typeA  the first marker byte (currently unused)
     * @param typeB  the second marker byte (currently unused)
     * @param stream the stream, positioned after the marker
     * @param xhtml  where to write any text found
     * @param l5     look-behind window, updated with the two length bytes
     *               when the padded form turns out not to be text
     */
    private void handleViewName(int typeA, int typeB, InputStream stream, XHTMLContentHandler xhtml,
                                Last5 l5) throws IOException, SAXException, TikaException {
        // Is it 8 byte zero padded?
        int maybeLength = EndianUtils.readUShortLE(stream);
        if (maybeLength == 0) {
            // Check the next 6 bytes too
            for (int i = 0; i < 6; i++) {
                int padding = stream.read();
                if (padding < 0 || padding > 0x0f) {
                    // Wrong, false detection
                    return;
                }
            }

            byte[] b2 = new byte[2];
            IOUtils.readFully(stream, b2);
            int length = EndianUtils.getUShortLE(b2);
            if (length > 1 && length <= MAX_TEXT_LENGTH) {
                // Length check passed
                handleText(length, stream, xhtml);
            } else {
                // Was probably something else
                l5.record(b2[0]);
                l5.record(b2[1]);
            }
        } else if (maybeLength > 0 && maybeLength < MAX_TEXT_LENGTH) {
            // Looks like it's straight into the text
            handleText(maybeLength, stream, xhtml);
        }
    }

    /**
     * Reads {@code length} bytes and, if the last of them is a null
     * terminator, writes the decoded text as a {@code <p>} element.
     *
     * @param length the claimed length of the text, including its terminator
     * @param stream the stream, positioned at the start of the text
     * @param xhtml  where to write the text
     */
    private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml)
            throws IOException, SAXException, TikaException {
        if (length <= 0) {
            // No room for even the null terminator, so this can't be text.
            // Without this check the terminator test below would index -1.
            return;
        }
        byte[] str = new byte[length];
        IOUtils.readFully(stream, str);
        if (str[length - 1] != 0) {
            // Not properly null terminated, must be wrong
            return;
        }

        String text = extractText(str, false);
        xhtml.startElement("p");
        xhtml.characters(text);
        xhtml.endElement("p");
    }

    /**
     * Does our best to turn the bytes into text
     *
     * @param data the raw bytes; the final byte is treated as a terminator
     *             and is never decoded
     * @param trim if true, stop at the first null byte rather than at the
     *             final byte
     * @return the decoded text, or an empty string if {@code data} is empty
     * @throws TikaException if the JVM does not provide the cp437 charset
     */
    private String extractText(byte[] data, boolean trim) throws TikaException {
        if (data.length == 0) {
            // Nothing to decode; avoids a negative length below
            return "";
        }

        // The text is always stored null terminated, but sometimes
        // may have extra null padding too
        int length = data.length - 1;
        if (trim) {
            for (int i = 0; i < data.length; i++) {
                if (data[i] == 0) {
                    length = i;
                    break;
                }
            }
        }

        // We believe that the text is basically stored as CP437
        // That said, there are a few characters slightly wrong for that...
        String text;
        try {
            text = new String(data, 0, length, "cp437");
        } catch (UnsupportedEncodingException e) {
            // Keep the original exception as the cause for diagnosis
            throw new TikaException("JVM Broken, core codepage CP437 missing!", e);
        }

        // Fix up the known character issues
        text = text.replace("\u03C6", "\u00D8");

        // All done, as best as we can!
        return text;
    }

    /**
     * Provides a view on the previous 5 bytes
     * <p>
     * The buffer starts out as all zeros, so {@link #is00()} is true until
     * a non-zero byte has been recorded.
     */
    private static class Last5 {
        /** Circular buffer of the recorded bytes */
        private final byte[] data = new byte[5];
        /** Index in {@link #data} that the next recorded byte is written to */
        private int pos = 0;

        /**
         * Stores the low 8 bits of the given value, overwriting the oldest entry.
         */
        private void record(int b) {
            data[pos] = (byte) b;
            pos++;
            if (pos >= data.length) {
                pos = 0;
            }
        }

        /**
         * Are all five stored bytes equal to the given value? The order in
         * which the bytes were recorded does not matter for this check.
         */
        private boolean allEqual(byte expected) {
            for (byte b : data) {
                if (b != expected) {
                    return false;
                }
            }
            return true;
        }

        /** Are all five stored bytes 0x33? */
        private boolean is33() {
            return allEqual((byte) 0x33);
        }

        /** Are all five stored bytes 0x00? */
        private boolean is00() {
            return allEqual((byte) 0x00);
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy