All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.chm.core.ChmExtractor Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.chm.core;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
import org.apache.tika.parser.chm.accessor.ChmItsfHeader;
import org.apache.tika.parser.chm.accessor.ChmItspHeader;
import org.apache.tika.parser.chm.accessor.ChmLzxcControlData;
import org.apache.tika.parser.chm.accessor.ChmLzxcResetTable;
import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
import org.apache.tika.parser.chm.assertion.ChmAssert;
import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extracts text from chm file. Enumerates chm entries.
 */
public class ChmExtractor {

    private static final Logger LOG = LoggerFactory.getLogger(ChmExtractor.class);


    private List lzxBlocksCache = null;
    private ChmDirectoryListingSet chmDirList = null;
    private ChmItsfHeader chmItsfHeader = null;
    private ChmItspHeader chmItspHeader = null;
    private ChmLzxcResetTable chmLzxcResetTable = null;
    private ChmLzxcControlData chmLzxcControlData = null;
    private byte[] data = null;
    private int indexOfContent;
    private long lzxBlockOffset;
    private long lzxBlockLength;
    private ChmBlockInfo chmBlockInfo = null;//this will be instantiated at first call of

    /**
     * Returns lzxc control data.
     * 
     * @return ChmLzxcControlData
     */
    private ChmLzxcControlData getChmLzxcControlData() {
        return chmLzxcControlData;
    }

    /**
     * Sets lzxc control data
     * 
     * @param chmLzxcControlData
     */
    private void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) {
        this.chmLzxcControlData = chmLzxcControlData;
    }

    private ChmItspHeader getChmItspHeader() {
        return chmItspHeader;
    }

    private void setChmItspHeader(ChmItspHeader chmItspHeader) {
        this.chmItspHeader = chmItspHeader;
    }

    /**
     * Returns lzxc reset table
     * 
     * @return ChmLzxcResetTable
     */
    private ChmLzxcResetTable getChmLzxcResetTable() {
        return chmLzxcResetTable;
    }

    /**
     * Sets lzxc reset table
     * 
     * @param chmLzxcResetTable
     */
    private void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) {
        this.chmLzxcResetTable = chmLzxcResetTable;
    }

    /**
     * Returns lzxc hit_cache length
     * 
     * @return lzxBlockLength
     */
    private long getLzxBlockLength() {
        return lzxBlockLength;
    }

    /**
     * Sets lzxc hit_cache length
     * 
     * @param lzxBlockLength
     */
    private void setLzxBlockLength(long lzxBlockLength) {
        this.lzxBlockLength = lzxBlockLength;
    }

    /**
     * Returns lzxc hit_cache offset
     * 
     * @return lzxBlockOffset
     */
    private long getLzxBlockOffset() {
        return lzxBlockOffset;
    }

    /**
     * Sets lzxc hit_cache offset
     */
    private void setLzxBlockOffset(long lzxBlockOffset) {
        this.lzxBlockOffset = lzxBlockOffset;
    }

    private int getIndexOfContent() {
        return indexOfContent;
    }

    private void setIndexOfContent(int indexOfContent) {
        this.indexOfContent = indexOfContent;
    }

    private byte[] getData() {
        return data;
    }

    private void setData(byte[] data) {
        this.data = data;
    }

    public ChmExtractor(InputStream is) throws TikaException, IOException {
        ChmAssert.assertInputStreamNotNull(is);
        try {
            setData(IOUtils.toByteArray(is));

            /* Creates and parses chm itsf header */
            setChmItsfHeader(new ChmItsfHeader());
            // getChmItsfHeader().parse(Arrays.copyOfRange(getData(), 0,
            // ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());
            getChmItsfHeader().parse(ChmCommons.copyOfRange(getData(), 0,
                            ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader());

            /* Creates and parses chm itsp header */
            setChmItspHeader(new ChmItspHeader());
            // getChmItspHeader().parse(Arrays.copyOfRange( getData(), (int)
            // getChmItsfHeader().getDirOffset(),
            // (int) getChmItsfHeader().getDirOffset() +
            // ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());
            getChmItspHeader().parse(
                    ChmCommons.copyOfRange(getData(), (int) getChmItsfHeader()
                            .getDirOffset(), (int) getChmItsfHeader().getDirOffset() + 
                            ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader());

            /* Creates instance of ChmDirListingContainer */
            setChmDirList(new ChmDirectoryListingSet(getData(),
                    getChmItsfHeader(), getChmItspHeader()));

            int indexOfControlData = getChmDirList().getControlDataIndex();
            int indexOfResetData = ChmCommons.indexOfResetTableBlock(getData(),
                    ChmConstants.LZXC.getBytes(UTF_8));
            byte[] dir_chunk = null;
            if (indexOfResetData > 0)
                dir_chunk = ChmCommons.copyOfRange( getData(), indexOfResetData, indexOfResetData  
                        + getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());
            // dir_chunk = Arrays.copyOfRange(getData(), indexOfResetData,
            // indexOfResetData
            // +
            // getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength());

            /* Creates and parses chm control data */
            setChmLzxcControlData(new ChmLzxcControlData());
            getChmLzxcControlData().parse(dir_chunk, getChmLzxcControlData());

            int indexOfResetTable = getChmDirList().getResetTableIndex();
            setChmLzxcResetTable(new ChmLzxcResetTable());

            int startIndex = (int) getChmDirList().getDataOffset()
                    + getChmDirList().getDirectoryListingEntryList()
                            .get(indexOfResetTable).getOffset();

            // assert startIndex < data.length
            ChmAssert.assertCopyingDataIndex(startIndex, getData().length);

            // dir_chunk = Arrays.copyOfRange(getData(), startIndex, startIndex
            // +
            // getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());
            dir_chunk = ChmCommons.copyOfRange(getData(), startIndex, startIndex
                            + getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength());

            getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable());

            setIndexOfContent(ChmCommons.indexOf(getChmDirList().getDirectoryListingEntryList(), 
                    ChmConstants.CONTENT));
            setLzxBlockOffset((getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getOffset() 
                    + getChmItsfHeader().getDataOffset()));
            setLzxBlockLength(getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()).getLength());

            setLzxBlocksCache(new ArrayList());

        } catch (IOException e) {
            LOG.warn("IOException parsing chm file", e);
        }
    }

    /**
     * Enumerates chm entities
     * 
     * @return list of chm entities
     */
    public List enumerateChm() {
        List listOfEntries = new ArrayList();
        for (DirectoryListingEntry directoryListingEntry : getChmDirList().getDirectoryListingEntryList()) {
            listOfEntries.add(directoryListingEntry.getName());
        }
        return listOfEntries;
    }

    /**
     * Decompresses a chm entry
     * 
     * @param directoryListingEntry
     * 
     * @return decompressed data
     * @throws TikaException 
     */
    public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry) throws TikaException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        ChmLzxBlock lzxBlock = null;
        try {
            /* UNCOMPRESSED type is easiest one */
            if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED
                    && directoryListingEntry.getLength() > 0
                    && !ChmCommons.hasSkip(directoryListingEntry)) {
                int dataOffset = (int) (getChmItsfHeader().getDataOffset() + directoryListingEntry
                        .getOffset());
                // dataSegment = Arrays.copyOfRange(getData(), dataOffset,
                // dataOffset + directoryListingEntry.getLength());
                buffer.write(ChmCommons.copyOfRange(
                        getData(), dataOffset,
                        dataOffset + directoryListingEntry.getLength()));
            } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED
                    && !ChmCommons.hasSkip(directoryListingEntry)) {
                /* Gets a chm hit_cache info */
                chmBlockInfo = ChmBlockInfo.getChmBlockInfoInstance(
                        directoryListingEntry, (int) getChmLzxcResetTable()
                                .getBlockLen(), getChmLzxcControlData(), chmBlockInfo);

                int i = 0, start = 0, hit_cache = 0;

                if ((getLzxBlockLength() < Integer.MAX_VALUE)
                        && (getLzxBlockOffset() < Integer.MAX_VALUE)) {
                    // TODO: Improve the caching
                    // caching ... = O(n^2) - depends on startBlock and endBlock
                    start = -1;
                    if (!getLzxBlocksCache().isEmpty()) {
                        for (i = 0; i < getLzxBlocksCache().size(); i++) {
                            //lzxBlock = getLzxBlocksCache().get(i);
                            int bn = getLzxBlocksCache().get(i).getBlockNumber();
                            for (int j = chmBlockInfo.getIniBlock(); j <= chmBlockInfo.getStartBlock(); j++) {
                                if (bn == j) {
                                    if (j > start) {
                                        start = j;
                                        hit_cache = i;
                                    }
                                }
                            }
                            if (start == chmBlockInfo.getStartBlock())
                                break;
                        }
                    }

//                    if (i == getLzxBlocksCache().size() && i == 0) {
                    if (start<0) {
                        start = chmBlockInfo.getIniBlock();

                        byte[] dataSegment = ChmCommons.getChmBlockSegment(
                                getData(),
                                getChmLzxcResetTable(), start,
                                (int) getLzxBlockOffset(),
                                (int) getLzxBlockLength());

                        lzxBlock = new ChmLzxBlock(start, dataSegment,
                                getChmLzxcResetTable().getBlockLen(), null);

                        getLzxBlocksCache().add(lzxBlock);
                    } else {
                        lzxBlock = getLzxBlocksCache().get(hit_cache);
                    }

                    for (i = start; i <= chmBlockInfo.getEndBlock();) {
                        if (i == chmBlockInfo.getStartBlock() && i == chmBlockInfo.getEndBlock()) {
                            buffer.write(lzxBlock.getContent(
                                    chmBlockInfo.getStartOffset(), chmBlockInfo.getEndOffset()));
                            break;
                        }

                        if (i == chmBlockInfo.getStartBlock()) {
                            buffer.write(lzxBlock.getContent(
                                    chmBlockInfo.getStartOffset()));
                        }

                        if (i > chmBlockInfo.getStartBlock() && i < chmBlockInfo.getEndBlock()) {
                            buffer.write(lzxBlock.getContent());
                        }

                        if (i == chmBlockInfo.getEndBlock()) {
                            buffer.write(lzxBlock.getContent(
                                    0, chmBlockInfo.getEndOffset()));
                            break;
                        }

                        i++;

                        if (i % getChmLzxcControlData().getResetInterval() == 0) {
                            lzxBlock = new ChmLzxBlock(i,
                                    ChmCommons.getChmBlockSegment(getData(),
                                            getChmLzxcResetTable(), i,
                                            (int) getLzxBlockOffset(),
                                            (int) getLzxBlockLength()),
                                    getChmLzxcResetTable().getBlockLen(), null);
                        } else {
                            lzxBlock = new ChmLzxBlock(i,
                                    ChmCommons.getChmBlockSegment(getData(),
                                            getChmLzxcResetTable(), i,
                                            (int) getLzxBlockOffset(),
                                            (int) getLzxBlockLength()),
                                    getChmLzxcResetTable().getBlockLen(),
                                    lzxBlock);
                        }

                        getLzxBlocksCache().add(lzxBlock);
                    }

                    if (getLzxBlocksCache().size() > getChmLzxcResetTable()
                            .getBlockCount()) {
                        getLzxBlocksCache().clear();
                    }
                } //end of if
                
                if (buffer.size() != directoryListingEntry.getLength()) {
                    throw new TikaException("CHM file extract error: extracted Length is wrong.");
                }
            } //end of if compressed
        } catch (Exception e) {
            throw new TikaException(e.getMessage());
        }

        return buffer.toByteArray();
    }

    private void setLzxBlocksCache(List lzxBlocksCache) {
        this.lzxBlocksCache = lzxBlocksCache;
    }

    private List getLzxBlocksCache() {
        return lzxBlocksCache;
    }

    private void setChmDirList(ChmDirectoryListingSet chmDirList) {
        this.chmDirList = chmDirList;
    }

    public ChmDirectoryListingSet getChmDirList() {
        return chmDirList;
    }

    private void setChmItsfHeader(ChmItsfHeader chmItsfHeader) {
        this.chmItsfHeader = chmItsfHeader;
    }

    private ChmItsfHeader getChmItsfHeader() {
        return chmItsfHeader;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy