org.apache.tika.parser.wordperfect.QPWTextExtractor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of tika-parsers Show documentation
There is a newer version: 1.0.18
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.wordperfect;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.UnsupportedFormatException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.QuattroPro;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;

/**
 * Extracts text from a Quattro Pro document according to QPW v9 File Format.
 * This format appears to be compatible with more recent versions too.
 * <p>
 * The input is opened as an OLE2 container and the records found in its
 * "NativeContent_MAIN" entry are walked one by one. Each record starts with
 * a type and a body length; record types registered in the extractor table
 * contribute text or metadata, every other record type is skipped.
 *
 * @author Pascal Essiembre
 */
class QPWTextExtractor {

    private static final Logger LOG =
            LogManager.getLogger(QPWTextExtractor.class);

    /** Name of the OLE2 entry holding the Quattro Pro record stream. */
    private static final String OLE_DOCUMENT_NAME = "NativeContent_MAIN";

    /**
     * One strategy per kind of record body. Each constant consumes the body
     * of the record currently described by the {@link Context} and pushes
     * whatever it finds to the context's XHTML handler or metadata.
     */
    private enum Extractor {
        /** Skips the whole record body. */
        IGNORE {
            @Override
            public void extract(Context ctx) throws IOException {
                ctx.in.skipWPByte(ctx.bodyLength);
            }
        },
        /** Beginning of file: stores id, version, build and page count. */
        BOF {
            @Override
            public void extract(Context ctx) throws IOException {
                ctx.metadata.set(QuattroPro.ID, ctx.in.readWPString(4));
                ctx.metadata.set(QuattroPro.VERSION, ctx.in.readWPShort());
                ctx.metadata.set(QuattroPro.BUILD, ctx.in.readWPShort());
                ctx.in.readWPShort(); // Last saved bits
                ctx.metadata.set(
                        QuattroPro.LOWEST_VERSION, ctx.in.readWPShort());
                ctx.metadata.set(Office.PAGE_COUNT, ctx.in.readWPShort());
                // Skip the rest of the body. NOTE(review): 14 presumably is
                // the 4-character id plus five 2-byte shorts read above;
                // confirm against WPInputStream.
                ctx.in.skipWPByte(ctx.bodyLength - 14);
            }
        },
        /** User record: stores the creator and modifier labels. */
        USER {
            @Override
            public void extract(Context ctx) throws IOException {
                ctx.metadata.set(
                        TikaCoreProperties.CREATOR, getQstrLabel(ctx.in));
                ctx.metadata.set(
                        TikaCoreProperties.MODIFIER, getQstrLabel(ctx.in));
            }
        },
        /** External link: emits the link label followed by a line break. */
        EXT_LINK {
            @Override
            public void extract(Context ctx)
                    throws IOException, SAXException {
                ctx.in.readWPShort(); // index
                ctx.in.readWPShort(); // page first
                ctx.in.readWPShort(); // page last
                ctx.xhtml.characters(getQstrLabel(ctx.in));
                ctx.xhtml.characters(System.lineSeparator());
            }
        },
        /** String table: emits every entry, one per line. */
        STRING_TABLE {
            @Override
            public void extract(Context ctx)
                    throws IOException, SAXException {
                long entries = ctx.in.readWPLong();
                ctx.in.readWPLong();  // Total used
                ctx.in.readWPLong();  // Total saved
                // The counter is a long so that it cannot wrap around
                // before reaching the declared number of entries.
                for (long i = 0; i < entries; i++) {
                    ctx.xhtml.characters(getQstrLabel(ctx.in));
                    ctx.xhtml.characters(System.lineSeparator());
                }
            }
        },
        /** Beginning of sheet: emits the sheet label and a line break. */
        BOS {
            @Override
            public void extract(Context ctx)
                    throws IOException, SAXException {
                ctx.in.readWPShort(); // sheet #
                ctx.in.readWPShort(); // first col index
                ctx.in.readWPShort(); // last col index
                ctx.in.readWPLong();  // first row index
                ctx.in.readWPLong();  // last row index
                ctx.in.readWPShort(); // format
                ctx.in.readWPShort(); // flags
                ctx.xhtml.characters(getQstrLabel(ctx.in));
                ctx.xhtml.characters(System.lineSeparator());
            }
        },
        /** Sheet header or footer: emits its label and a line break. */
        SHEET_HEADFOOT {
            @Override
            public void extract(Context ctx)
                    throws IOException, SAXException {
                ctx.in.readWPShort(); // flag
                ctx.xhtml.characters(getQstrLabel(ctx.in));
                ctx.xhtml.characters(System.lineSeparator());
            }
        },
        /** String value of a formula cell: emits the label. */
        FORMULA_STRING_VALUE {
            @Override
            public void extract(Context ctx)
                    throws IOException, SAXException {
                ctx.in.readWPShort(); // column
                ctx.in.readWPLong();  // row
                ctx.xhtml.characters(getQstrLabel(ctx.in));
            }
        },
        /** Generic label cell: emits the label. */
        CGENERICLABEL {
            @Override
            public void extract(Context ctx)
                    throws IOException, SAXException {
                ctx.in.readWPShort(); // column
                ctx.in.readWPLong();  // row
                ctx.in.readWPShort(); // format index
                ctx.xhtml.characters(getQstrLabel(ctx.in));
            }
        },
        /** Cell comment: emits the author name, then the comment text. */
        CCOMMENT {
            @Override
            public void extract(Context ctx)
                    throws IOException, SAXException {
                ctx.in.readWPShort(); // column
                ctx.in.readWPLong();  // row
                ctx.in.readWPLong();  // flag
                ctx.xhtml.characters(getQstrLabel(ctx.in));  // author name
                ctx.xhtml.characters(getQstrLabel(ctx.in));  // comment
            }
        },
        /**
         * Development aid: logs the record type (hex), its body length and
         * the body read as a string. Not registered for any record type.
         */
        DEBUG {
            @Override
            public void extract(Context ctx) throws IOException {
                LOG.error("REC ("
                        + Integer.toHexString(ctx.type) + "/" + ctx.bodyLength
                        + "):" + ctx.in.readWPString(ctx.bodyLength));
            }
        };

        /**
         * Consumes the body of the current record.
         *
         * @param ctx stream, output handler, metadata and current record
         *            header
         * @throws IOException if reading from the stream fails
         * @throws SAXException if the XHTML handler rejects the output
         */
        public abstract void extract(Context ctx)
                throws IOException, SAXException;
    }

    // Holds extractors for each record types we are interested in.
    // All record types not defined here will be skipped.
    private static final Map<Integer, Extractor> EXTRACTORS =
            new HashMap<>();
    static {
        //--- Global Records ---
        EXTRACTORS.put(0x0001, Extractor.BOF);     // Beginning of file
        EXTRACTORS.put(0x0005, Extractor.USER);    // User

        //--- Notebook Records ---
        EXTRACTORS.put(0x0403, Extractor.EXT_LINK); // External link
        EXTRACTORS.put(0x0407, Extractor.STRING_TABLE); // String table

        //--- Sheet Records ---
        EXTRACTORS.put(0x0601, Extractor.BOS); // Beginning of sheet
        EXTRACTORS.put(0x0605, Extractor.SHEET_HEADFOOT); // Sheet header
        EXTRACTORS.put(0x0606, Extractor.SHEET_HEADFOOT); // Sheet footer

        //--- Cells ---
        EXTRACTORS.put(0x0c02, Extractor.FORMULA_STRING_VALUE);
        EXTRACTORS.put(0x0c72, Extractor.CGENERICLABEL);
        EXTRACTORS.put(0x0c80, Extractor.CCOMMENT);
    }

    /**
     * State shared with the extractors while walking the record stream:
     * the fixed input/output collaborators plus the type and body length
     * of the record currently being processed.
     * <p>
     * Declared static because it never needs the enclosing extractor
     * instance.
     */
    static final class Context {
        private final WPInputStream in;
        private final XHTMLContentHandler xhtml;
        private final Metadata metadata;
        private int type;
        private int bodyLength;

        public Context(WPInputStream in, XHTMLContentHandler xhtml,
                Metadata metadata) {
            super();
            this.in = in;
            this.xhtml = xhtml;
            this.metadata = metadata;
        }
    }

    /**
     * Reads a Quattro Pro document and writes its text inside a single
     * {@code <p>} element of the given handler, filling in metadata along
     * the way.
     *
     * @param input stream holding the OLE2 document
     * @param xhtml handler receiving the extracted text
     * @param metadata metadata to populate
     * @throws IOException if the document cannot be read
     * @throws SAXException if the handler rejects the output
     * @throws TikaException as {@link UnsupportedFormatException} when the
     *         container has no root directory or no "NativeContent_MAIN"
     *         entry
     */
    @SuppressWarnings("resource")
    public void extract(
            InputStream input, XHTMLContentHandler xhtml, Metadata metadata)
                    throws IOException, SAXException, TikaException {

        POIFSFileSystem pfs = new POIFSFileSystem(input);
        DirectoryNode rootNode = pfs.getRoot();
        if (rootNode == null || !rootNode.hasEntry(OLE_DOCUMENT_NAME)) {
            // Build the "found" part null-safely so that a missing root is
            // reported as an unsupported format rather than as a
            // NullPointerException.
            String found = (rootNode == null)
                    ? "none (no root directory)"
                    : String.valueOf(rootNode.getEntryNames());
            throw new UnsupportedFormatException(
                    "Unsupported QuattroPro file format. "
                    + "Looking for OLE entry \"" + OLE_DOCUMENT_NAME
                    + "\". Found: " + found);
        }

        //TODO shall we validate and throw warning/error if the file does not
        //start with a BOF and ends with a EOF?
        xhtml.startElement("p");
        try (WPInputStream in = new WPInputStream(
                pfs.createDocumentInputStream(OLE_DOCUMENT_NAME))) {
            Context ctx = new Context(in, xhtml, metadata);
            while (hasNext(in)) {
                // Record header: type followed by body length.
                ctx.type = in.readWPShort();
                ctx.bodyLength = in.readWPShort();
                Extractor extractor = EXTRACTORS.get(ctx.type);
                if (extractor == null) {
                    // Swap in Extractor.DEBUG here to find out what is
                    // being ignored.
                    extractor = Extractor.IGNORE;
                }
                extractor.extract(ctx);
            }
        }
        xhtml.endElement("p");
    }

    /**
     * Tells whether at least one more byte can be read, without consuming
     * it: the stream is marked, one byte is read, and the stream is reset
     * in all cases.
     *
     * @param in stream to probe
     * @return {@code true} unless the read hit the end of the stream
     * @throws IOException if reading or resetting fails
     */
    private boolean hasNext(InputStream in) throws IOException {
        try {
            in.mark(1);
            return in.read() != -1;
        } finally {
            in.reset();
        }
    }

    /**
     * Reads a QSTR label: a short count, one string-type byte (ignored),
     * then {@code count + 1} characters.
     * <p>
     * NOTE(review): one more character than the count is read; presumably
     * the stored count does not include the first character. Confirm
     * against the file format specification.
     *
     * @param in stream positioned at the start of the label
     * @return the characters read, as a string
     * @throws IOException if reading from the stream fails
     */
    private static String getQstrLabel(WPInputStream in) throws IOException {
        // QSTR
        int count = in.readWPShort();
        in.readWPByte(); // string type
        char[] text = new char[count + 1];
        text[0] = in.readWPChar();

        // QSTRLABEL
        for (int i = 0; i < count; i++) {
            text[i + 1] = in.readWPChar();
        }
        return new String(text);
    }
}