All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.hwp.HwpTextExtractorV5 Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.hwp;

import javax.crypto.Cipher;
import javax.crypto.CipherInputStream;
import javax.crypto.NoSuchPaddingException;
import javax.crypto.spec.SecretKeySpec;

import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.security.InvalidKeyException;
import java.security.Key;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;

import org.apache.poi.hpsf.NoPropertySetStreamException;
import org.apache.poi.hpsf.Property;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LittleEndian;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.UnsupportedFormatException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.XHTMLContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;

public class HwpTextExtractorV5 implements Serializable {
    private static final long serialVersionUID = 1L;
    protected static Logger LOG = LoggerFactory
            .getLogger(HwpTextExtractorV5.class);

    private static final byte[] HWP_V5_SIGNATURE = "HWP Document File"
            .getBytes(StandardCharsets.US_ASCII);

    private static final int HWPTAG_BEGIN = 0x010;

    private static final int I = 1; // INLINE
    private static final int C = 2; // CONTROL
    private static final int X = 3; // EXTENDED

    private static final int[] HWP_CHAR_TYPE = new int[]{
            C, X, X, X, I, I, I, I, I, I, // 0-9
            C, X, X, C, X, X, X, X, X, I, // 10-19
            I, X, X, X, C, C, C, C, C, C, // 20-29
            C, C}; // 30-31


    /**
     * extract Text from HWP Stream.
     *
     * @param source
     * @param metadata
     * @param xhtml
     * @return
     * @throws FileNotFoundException
     * @throws IOException
     * @throws SAXException
     */
    public void extract(InputStream source, Metadata metadata, XHTMLContentHandler xhtml)
            throws FileNotFoundException, IOException,
            TikaException, SAXException {
        if (source == null || xhtml == null)
            throw new IllegalArgumentException();

        POIFSFileSystem fs = null;
        try {
            fs = new POIFSFileSystem(new CloseShieldInputStream(source));

            DirectoryNode root = fs.getRoot();
            extract0(root, metadata, xhtml);

        } catch (IOException e) {
            throw new TikaException(
                    "error occurred when parsing HWP Format, It may not HWP Format.", e);
        } finally {
            IOUtils.closeQuietly(fs);
        }
    }

    private void extract0(DirectoryNode root, Metadata metadata, XHTMLContentHandler xhtml)
            throws IOException, SAXException, TikaException {

        Entry headerEntry = root.getEntry("FileHeader");
        if (!headerEntry.isDocumentEntry()) {
            throw new UnsupportedFormatException("cannot parse the File Header");
        }

        FileHeader header = getHeader(headerEntry);

        if (header == null) {
            throw new UnsupportedFormatException("cannot parse the File Header");
        }

        if (header.encrypted) {
            throw new EncryptedDocumentException("document is encrypted");
        }

        parseSummaryInformation(root, metadata);

        if (header.viewtext) {
            parseViewText(header, root, xhtml);
        } else {
            parseBodyText(header, root, xhtml);
        }

    }

    private void parseSummaryInformation(DirectoryNode root, Metadata metadata) throws TikaException {

        try {
            Entry summaryEntry = root.getEntry("\u0005HwpSummaryInformation");

            populateMatadata(summaryEntry, metadata);

        } catch (NoPropertySetStreamException | IOException e) {
            throw new UnsupportedFormatException(
                    "cannot parse the Summary Information");
        }

    }

    private void populateMatadata(Entry summaryEntry, Metadata metadata) throws IOException, NoPropertySetStreamException {

        DocumentInputStream summaryStream = new DocumentInputStream(
                (DocumentEntry) summaryEntry);

        PropertySet ps = new PropertySet(summaryStream);

        Property[] props = ps.getProperties();

        for (Property prop : props) {
            int propID = (int) prop.getID();
            Object value = prop.getValue();

            switch (propID) {
                case 2:
                    metadata.set(TikaCoreProperties.TITLE, (String) value);
                    break;
                case 3:
                    metadata.set(OfficeOpenXMLCore.SUBJECT, (String) value);
                    break;
                case 4:
                    metadata.set(TikaCoreProperties.CREATOR, (String) value);
                    break;
                case 5:
                    metadata.set(Office.KEYWORDS, (String) value);
                    break;
                case 6:
                    metadata.set(TikaCoreProperties.COMMENTS, (String) value);
                    break;
                case 8:
                    metadata.set(TikaCoreProperties.MODIFIER, (String) value);
                    break;
                case 12:
                    metadata.set(TikaCoreProperties.CREATED, (Date) value);
                    break;
                case 13:
                    metadata.set(TikaCoreProperties.MODIFIED, (Date) value);
                    break;
                case 14:
                    metadata.set(Office.PAGE_COUNT, (int) value);
                    break;
                default:
            }
        }
    }

    /**
     * extract the HWP File Header
     *
     * @param headerEntry
     * @return
     * @throws IOException
     */
    private FileHeader getHeader(Entry headerEntry) throws IOException {
        // confirm signature
        byte[] header = new byte[256]; // the length of File header is 256

        try (DocumentInputStream headerStream = new DocumentInputStream(
                (DocumentEntry) headerEntry)) {
            int read = headerStream.read(header);
            if (read != 256
                    || !Arrays.equals(HWP_V5_SIGNATURE, Arrays.copyOfRange(
                    header, 0, HWP_V5_SIGNATURE.length)))
                return null;
        }

        FileHeader fileHeader = new FileHeader();

        // version. debug
        fileHeader.version = HwpVersion.parseVersion(LittleEndian.getUInt(
                header, 32));
        long flags = LittleEndian.getUInt(header, 36);
        LOG.debug("Flags={}", Long.toBinaryString(flags).replace(' ', '0'));

        fileHeader.compressed = (flags & 0x01) == 0x01;
        fileHeader.encrypted = (flags & 0x02) == 0x02;
        fileHeader.viewtext = (flags & 0x04) == 0x04;

        return fileHeader;
    }

    /**
     * extract Text
     *
     * @param header
     * @param root
     * @param xhtml
     * @return
     * @throws IOException
     * @throws SAXException
     */
    private void parseBodyText(FileHeader header, DirectoryNode root,
                               XHTMLContentHandler xhtml) throws IOException, SAXException {
        // read BodyText
        Entry bodyText = root.getEntry("BodyText");
        if (bodyText == null || !bodyText.isDirectoryEntry()) {
            throw new IOException("Invalid BodyText");
        }

        Iterator iterator = ((DirectoryEntry) bodyText).getEntries();
        while (iterator.hasNext()) {
            Entry entry = iterator.next();
            if (entry.getName().startsWith("Section")
                    && entry instanceof DocumentEntry) {
                LOG.debug("extract {}", entry.getName());

                InputStream input = new DocumentInputStream(
                        (DocumentEntry) entry);

                if (header.compressed) {
                    input = new InflaterInputStream(input, new Inflater(true));
                }

                HwpStreamReader reader = new HwpStreamReader(input);

                parse(reader, xhtml);

            } else {
                LOG.warn("Unknown Entry '{}'({})", entry.getName(), entry);
            }
        }
    }

    /**
     * 텍스트 추출
     *
     * @param header
     * @param root
     * @param xhtml
     * @return
     * @throws IOException
     */
    private void parseViewText(FileHeader header, DirectoryNode root,
                               XHTMLContentHandler xhtml) throws IOException {
        // read BodyText
        Entry bodyText = root.getEntry("ViewText");
        if (bodyText == null || !bodyText.isDirectoryEntry()) {
            throw new IOException("Invalid ViewText");
        }

        Iterator iterator = ((DirectoryEntry) bodyText).getEntries();
        while (iterator.hasNext()) {
            Entry entry = iterator.next();
            if (entry.getName().startsWith("Section")
                    && entry instanceof DocumentEntry) {
                LOG.debug("extract {}", entry.getName());

                InputStream input = new DocumentInputStream(
                        (DocumentEntry) entry);

                try {
                    Key key = readKey(input);
                    input = createDecryptStream(input, key);
                    if (header.compressed) {
                        input = new InflaterInputStream(input, new Inflater(
                                true));
                    }

                    HwpStreamReader sectionStream = new HwpStreamReader(input);
                    parse(sectionStream, xhtml);
                } catch (InvalidKeyException e) {
                    throw new IOException(e);
                } catch (NoSuchAlgorithmException e) {
                    throw new IOException(e);
                } catch (NoSuchPaddingException e) {
                    throw new IOException(e);
                } catch (SAXException e) {
                    throw new IOException(e);
                } finally {
                    IOUtils.closeQuietly(input);
                }
            } else {
                LOG.warn("unknown Entry '{}'({})", entry.getName(), entry);
            }
        }
    }

    private Key readKey(InputStream input) throws IOException {
        byte[] data = new byte[260];

        if (IOUtils.readFully(input, data, 0, 4) != 4) {// TAG,
            throw new EOFException();
        }

        if (IOUtils.readFully(input, data, 0, 256) != 256) {
            throw new EOFException();
        }

        SRand srand = new SRand(LittleEndian.getInt(data));
        byte xor = 0;
        for (int i = 0, n = 0; i < 256; i++, n--) {
            if (n == 0) {
                xor = (byte) (srand.rand() & 0xFF);
                n = (int) ((srand.rand() & 0xF) + 1);
            }
            if (i >= 4) {
                data[i] = (byte) ((data[i]) ^ (xor));
            }
        }

        int offset = 4 + (data[0] & 0xF); // 4 + (0~15) ?
        byte[] key = Arrays.copyOfRange(data, offset, offset + 16);

        SecretKeySpec secretKey = new SecretKeySpec(key, "AES");
        return secretKey;
    }

    public InputStream createDecryptStream(InputStream input, Key key)
            throws NoSuchAlgorithmException,
            NoSuchPaddingException, InvalidKeyException {
        Cipher cipher = null;

        cipher = Cipher.getInstance("AES/ECB/NoPadding");
        cipher.init(Cipher.DECRYPT_MODE, key);

        return new CipherInputStream(input, cipher);
    }

    /**
     * extract characters from Section stream
     *
     * @param reader
     * @param xhtml
     * @throws IOException
     * @throws SAXException
     */
    private void parse(HwpStreamReader reader, XHTMLContentHandler xhtml)
            throws IOException, SAXException {
        StringBuffer buf = new StringBuffer(1024);
        TagInfo tag = new TagInfo();

        while (true) {
            if (!readTag(reader, tag))
                break;

            if (HWPTAG_BEGIN + 51 == tag.id) {
                if (tag.length % 2 != 0) {
                    throw new IOException("Invalid block size");
                }
                buf.setLength(0);
                writeParaText(reader, tag.length, buf);

                if (buf.length() > 0) {
                    buf.append('\n');

                    xhtml.startElement("p");
                    xhtml.characters(buf.toString());
                    xhtml.endElement("p");
                }
            } else {
                reader.ensureSkip(tag.length);
            }
        }
    }


    /**
     * transfer character stream of HWPTAG_PARA_TEXT to STRING
     *
     * @param reader
     * @param datasize
     * @param buf
     * @throws IOException
     */
    private void writeParaText(HwpStreamReader reader, long datasize,
                               StringBuffer buf) throws IOException {
        int[] chars = reader.uint16((int) (datasize / 2));

        for (int index = 0; index < chars.length; index++) {
            int ch = chars[index];
            if (ch < 32) {
                if (ch == 9) { // tab, INLINE
                    buf.append('\t');
                    index += 7;
                } else {
                    int type = HWP_CHAR_TYPE[ch];
                    if (I == type) { // INLINE
                        index += 7;
                    } else if (X == type) { // EXTENDED
                        index += 7;
                    } else if (C == type) { // CONTROL
                        buf.append(' ');
                    }
                }
            } else {
                buf.append((char) ch);
            }
        }
    }

    private boolean readTag(HwpStreamReader reader, TagInfo tag)
            throws IOException {
        // see p.24 of hwp 5.0 format guide

        long recordHeader = reader.uint32();
        if (recordHeader == -1)
            return false;

        tag.id = recordHeader & 0x3FF;
        tag.level = (recordHeader >> 10) & 0x3FF;
        tag.length = (recordHeader >> 20) & 0xFFF;

        // see p.24 of hwp 5.0 format guide
        if (tag.length == 0xFFF)
            tag.length = reader.uint32();

        return true;
    }

    private static class SRand {
        private int random_seed;

        private SRand(int seed) {
            random_seed = seed;
        }

        private int rand() {
            random_seed = (random_seed * 214013 + 2531011) & 0xFFFFFFFF;
            return (random_seed >> 16) & 0x7FFF;
        }
    }

    static class FileHeader {
        HwpVersion version;
        boolean compressed; // bit 0
        boolean encrypted; // bit 1
        boolean viewtext; // bit 2
    }

    static class TagInfo {
        long id;
        long level;
        long length;
    }

    static class HwpVersion {
        int m;
        int n;
        int p;
        int r;

        public String toString() {
            return String.format(
                    Locale.US, "%d.%d.%d.%d", m, n, p, r);
        }

        public static HwpVersion parseVersion(long longVersion) {
            HwpVersion version = new HwpVersion();
            version.m = (int) ((longVersion & 0xFF000000L) >> 24);
            version.n = (int) ((longVersion & 0x00FF0000L) >> 16);
            version.p = (int) ((longVersion & 0x0000FF00L) >> 8);
            version.r = (int) ((longVersion & 0x000000FFL));
            return version;
        }
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy