All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.tika.parser.wordperfect.WP5DocumentAreaExtractor Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.wordperfect;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.collections4.MapUtils;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;

/**
 * Extracts WordPerfect Document Area text from a WordPerfect document
 * version 5.x.
 * @author Pascal Essiembre
 */
class WP5DocumentAreaExtractor extends WPDocumentAreaExtractor {

    /*
     * Byte values 192-207 introduce fixed-length multi-byte functions.
     * This table maps each function byte to the total size of the function
     * in bytes, including the leading function byte itself. Functions that
     * are not handled explicitly in extract(...) are skipped according to
     * this size (minus the first byte, which has already been read).
     *
     * The map is typed <Integer, Integer> so that lookups auto-unbox to int;
     * with a raw Map the arithmetic on the looked-up value does not compile.
     */
    private static final Map<Integer, Integer> FIXED_LENGTH_FUNCTION_SIZES =
            MapUtils.putAll(new HashMap<Integer, Integer>(), new Integer[] {
        192, 4,  // Extended character
        193, 9,  // Center/Align/ Tab/Left Margin Release
        194, 11, // Indent
        195, 3,  // Attribute ON
        196, 3,  // Attribute OFF
        197, 5,  // Block Protect
        198, 6,  // End of Indent
        199, 7,  // Different Display Character when Hyphenated
        200, 4,  // (Reserved)
        201, 5,  // (Reserved)
        202, 6,  // (Reserved)
        203, 6,  // (Reserved)
        204, 8,  // (Reserved)
        205, 10, // (Reserved)
        206, 10, // (Reserved)
        207, 12, // (Reserved)
    });

    /**
     * Handles a single document-area byte that has already been read from
     * the stream. Depending on its value, the byte either contributes text
     * to {@code out}, ends the current paragraph, or introduces a
     * multi-byte function whose remaining bytes are consumed from
     * {@code in}. Values that match none of the branches below are ignored.
     *
     * @param c the byte value that was just read
     * @param in the stream from which any remaining bytes of a multi-byte
     *        function are read or skipped
     * @param out the buffer receiving extracted text
     * @param xhtml the content handler passed on to the inherited
     *        {@code endParagraph} call
     * @throws IOException declared for the stream read/skip calls
     * @throws SAXException declared for the {@code endParagraph} calls
     */
    protected void extract(int c, WPInputStream in, StringBuilder out,
            XHTMLContentHandler xhtml) throws IOException, SAXException {

        // 0-31: control characters
        if (c == 10) {
            endParagraph(out, xhtml); // hard return ("Enter")
        } else if (c == 11) {
            out.append(' ');          // soft page break
        } else if (c == 12) {
            endParagraph(out, xhtml); // hard page break
        } else if (c == 13) {
            out.append(' ');          // soft return (line wrap)

        // 32-126: ASCII characters
        } else if (c >= 32 && c <= 126) {
            out.append((char) c);     // ASCII character

        // 128-191: single-byte functions
        } else if (c == 140) {
            endParagraph(out, xhtml); // combination hard return/soft page (WP5.1)
        } else if (c >= 144 && c <= 149) {
            out.append(' ');          // deletable/invisible soft return/page
        } else if (c == 153) {
            endParagraph(out, xhtml); // dormant hard return (WP5.1)
        } else if (c == 160) {
            out.append('\u00A0');     // hard space
        } else if (c >= 169 && c <= 171) {
            out.append('-');          // hard hyphen

        // 192-207: fixed-length multi-byte functions
        } else if (c == 192) {
            // Extended character: value byte, character-set byte, then a
            // closing byte that is read and discarded.
            int charval = in.readWP();
            int charset = in.readWP();
            in.readWP(); // closing character
            WP5Charsets.append(out, charset, charval);
        } else if (c >= 193 && c <= 207) {
            // Every value in 193-207 has an entry in the size table, so the
            // lookup cannot yield null here. Subtract 1 from the function
            // length because the first byte has already been read.
            int functionLength = FIXED_LENGTH_FUNCTION_SIZES.get(c);
            in.skipWPByte(functionLength - 1);

        // 208-255: variable-length multi-byte functions
        } else if (c >= 208 && c <= 255) {
            in.readWP(); // subgroup (the function code)
            // The size read here is used as the number of bytes to consume.
            int functionSize = in.readWPShort();
            for (int i = 0; i < functionSize; i++) {
                in.readWP();
            }
            //TODO Are there functions containing data? Like footnotes?
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy