// org.apache.tika.parser.prt.PRTParser

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.prt;

import static java.nio.charset.StandardCharsets.US_ASCII;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;

/**
 * A basic text extracting parser for the CADKey PRT (CAD Drawing)
 * format. It outputs text from note entries.
 */

public class PRTParser implements Parser {

    /** The MIME type string this parser writes into the content-type metadata. */
    public static final String PRT_MIME_TYPE = "application/x-prt";
    /**
     * Serial version UID
     */
    private static final long serialVersionUID = 4659638314375035178L;
    /** The single media type returned by {@link #getSupportedTypes(ParseContext)}. */
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.singleton(MediaType.application("x-prt"));
    /**
     * How long do we allow a text run to claim to be, before we
     * decide we're confused and it's not really text after all?
     */
    private static final int MAX_TEXT_LENGTH = 0x0800;
    /**
     * Largest byte value still accepted as "padding" between a marker
     * and the length field of a text run.
     */
    private static final int MAX_PADDING_BYTE = 0x0f;

    /**
     * Returns the media types handled by this parser.
     *
     * @param context the parse context (not consulted)
     * @return an immutable singleton set holding {@code application/x-prt}
     */
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /*
     * Text types:
     *   00 00 00 00 f0 [3b]f sz sz TEXT     *view name*
     *   00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT  *view name*
     *   (anything)  e0 3f sz sz TEXT    *view name*
     *   3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT    *note entries*
     *
     *  Note - all text is null terminated
     */

    /**
     * Extracts metadata and text from a PRT stream.
     *
     * <p>The first 30 bytes are skipped, the next 12 are read as an ASCII
     * {@code YYYYMMDDhhmm} creation date, and the following 500 bytes are
     * treated as an optional null terminated description. The remainder of
     * the stream is scanned byte by byte for the marker sequences listed in
     * the comment above; each text run found is emitted as a {@code <p>}
     * element.
     *
     * <p>NOTE(review): this method never calls {@code startDocument()} or
     * {@code endDocument()} on the XHTML handler - confirm that this is
     * intended for {@link XHTMLContentHandler}.
     *
     * @param stream   the PRT data; it is read but not closed here
     * @param handler  receives the extracted text as SAX events
     * @param metadata receives the creation date, content type and description
     * @param context  the parse context (not consulted)
     * @throws IOException   if reading from the stream fails
     * @throws SAXException  if the handler rejects an event
     * @throws TikaException if a length field or the text cannot be decoded
     */
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {

        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        Last5 l5 = new Last5();
        int read;

        // Try to get the creation date, which is YYYYMMDDhhmm
        byte[] header = new byte[30];
        IOUtils.readFully(stream, header);
        byte[] date = new byte[12];
        IOUtils.readFully(stream, date);

        String dateStr = new String(date, US_ASCII);
        if (dateStr.startsWith("19") || dateStr.startsWith("20")) {
            String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4, 6) + "-" +
                    dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) + ":" +
                    dateStr.substring(10, 12) + ":00";
            // Stored once, as the creation date only.
            // TODO Metadata.DATE is used as modified, should it be set here too?
            metadata.set(TikaCoreProperties.CREATED, formattedDate);
        }
        metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);

        // The description, if set, is the next up-to-500 bytes
        byte[] desc = new byte[500];
        IOUtils.readFully(stream, desc);
        String description = extractText(desc, true);
        if (description.length() > 0) {
            metadata.set(TikaCoreProperties.DESCRIPTION, description);
        }

        // Now look for text
        while ((read = stream.read()) > -1) {
            if (read == 0xe0 || read == 0xe3 || read == 0xf0) {
                int nread = stream.read();
                if (nread == 0x3f || nread == 0xbf) {
                    // Looks promising, check back for a suitable value
                    if (read == 0xe3 && nread == 0x3f) {
                        if (l5.is33()) {
                            // Bingo, note text
                            handleNoteText(stream, xhtml);
                        }
                    } else if (l5.is00()) {
                        // Likely view name
                        handleViewName(read, nread, stream, xhtml, l5);
                    }
                }
            } else {
                // Only non-marker bytes are remembered for the look-behind check
                l5.record(read);
            }
        }
    }

    /**
     * Handles a candidate note entry, positioned just after the {@code e3 3f}
     * marker. Expects 10 padding bytes, then {@code 1f}, then a two byte
     * length followed by the text. Returns silently on a false detection.
     *
     * @param stream the stream, positioned directly after the marker
     * @param xhtml  receives the text, if any
     */
    private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml)
            throws IOException, SAXException, TikaException {
        // Ensure we have the right padding text
        if (!consumePadding(stream, 10)) {
            // Wrong, false detection
            return;
        }
        if (stream.read() != 0x1f) {
            // Wrong, false detection
            return;
        }

        int length = EndianUtils.readUShortLE(stream);
        if (length <= MAX_TEXT_LENGTH) {
            // Length check passed (a zero length is ignored by handleText)
            handleText(length, stream, xhtml);
        }
    }

    /**
     * Handles a candidate view name, positioned just after its two byte marker.
     * The length either follows immediately, or follows 8 bytes of padding
     * (two zero bytes plus six more padding bytes).
     *
     * @param typeA  first marker byte (currently unused)
     * @param typeB  second marker byte (currently unused)
     * @param stream the stream, positioned directly after the marker
     * @param xhtml  receives the text, if any
     * @param l5     look-behind buffer; the two length bytes are recorded into it
     *               when the padded form turns out not to be text
     */
    private void handleViewName(int typeA, int typeB, InputStream stream, XHTMLContentHandler xhtml,
                                Last5 l5) throws IOException, SAXException, TikaException {
        // Is it 8 byte zero padded?
        int maybeLength = EndianUtils.readUShortLE(stream);
        if (maybeLength == 0) {
            // Check the next 6 bytes too
            if (!consumePadding(stream, 6)) {
                // Wrong, false detection
                return;
            }

            byte[] b2 = new byte[2];
            IOUtils.readFully(stream, b2);
            int length = EndianUtils.getUShortLE(b2);
            if (length > 1 && length <= MAX_TEXT_LENGTH) {
                // Length check passed
                handleText(length, stream, xhtml);
            } else {
                // Was probably something else
                l5.record(b2[0]);
                l5.record(b2[1]);
            }
        } else if (maybeLength > 0 && maybeLength < MAX_TEXT_LENGTH) {
            // Looks like it's straight into the text
            handleText(maybeLength, stream, xhtml);
        }
    }

    /**
     * Reads {@code count} bytes and checks that each lies in
     * {@code 0x00..0x0f}. Stops at, and has already consumed, the first
     * byte that does not match; end of stream counts as a mismatch.
     *
     * @return {@code true} if all {@code count} bytes looked like padding
     */
    private boolean consumePadding(InputStream stream, int count) throws IOException {
        for (int i = 0; i < count; i++) {
            int read = stream.read();
            if (read < 0 || read > MAX_PADDING_BYTE) {
                return false;
            }
        }
        return true;
    }

    /**
     * Reads {@code length} bytes and, if they end in a null terminator,
     * emits them as the text of a {@code <p>} element.
     *
     * @param length the claimed length of the text run, terminator included;
     *               values of zero or less are ignored without reading
     * @param stream the stream, positioned at the first text byte
     * @param xhtml  receives the paragraph
     */
    private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml)
            throws IOException, SAXException, TikaException {
        if (length <= 0) {
            // Nothing to read; without this guard the terminator check
            // below would index position -1
            return;
        }

        byte[] str = new byte[length];
        IOUtils.readFully(stream, str);
        if (str[length - 1] != 0) {
            // Not properly null terminated, must be wrong
            return;
        }

        String text = extractText(str, false);

        xhtml.startElement("p");
        xhtml.characters(text);
        xhtml.endElement("p");
    }

    /**
     * Does our best to turn the bytes into text
     *
     * @param data the raw bytes, expected to end in a null terminator
     * @param trim if {@code true}, cut the text at the first null byte;
     *             otherwise only the final byte is dropped
     * @return the decoded text, possibly empty
     * @throws TikaException if the JVM does not provide the CP437 charset
     */
    private String extractText(byte[] data, boolean trim) throws TikaException {
        // The text is always stored null terminated, but sometimes
        //  may have extra null padding too
        // (clamped so that an empty array yields an empty string)
        int length = Math.max(0, data.length - 1);
        if (trim) {
            for (int i = 0; i < data.length; i++) {
                if (data[i] == 0) {
                    length = i;
                    break;
                }
            }
        }

        // We believe that the text is basically stored as CP437
        // That said, there are a few characters slightly wrong for that...
        String text;
        try {
            text = new String(data, 0, length, "cp437");
        } catch (UnsupportedEncodingException e) {
            // Keep the cause so the failure can be diagnosed
            throw new TikaException("JVM Broken, core codepage CP437 missing!", e);
        }

        // Fix up the known character issues
        text = text.replace("\u03C6", "\u00D8");

        // All done, as best as we can!
        return text;
    }

    /**
     * Remembers the 5 most recently recorded bytes in a ring buffer.
     * Before anything has been recorded the buffer holds five zero bytes.
     */
    private static class Last5 {
        private final byte[] data = new byte[5];
        /** Index of the slot the next recorded byte will overwrite. */
        private int pos = 0;

        /** Stores the low 8 bits of {@code b}, replacing the oldest entry. */
        private void record(int b) {
            data[pos] = (byte) b;
            pos = (pos + 1) % data.length;
        }

        /** @return {@code true} if all 5 remembered bytes are {@code 0x33} */
        private boolean is33() {
            return allEqual((byte) 0x33);
        }

        /** @return {@code true} if all 5 remembered bytes are {@code 0x00} */
        private boolean is00() {
            return allEqual((byte) 0x00);
        }

        /**
         * Order is irrelevant for an all-equal check, so the ring buffer
         * is scanned in place rather than being copied out first.
         */
        private boolean allEqual(byte expected) {
            for (byte b : data) {
                if (b != expected) {
                    return false;
                }
            }
            return true;
        }
    }
}