org.apache.tika.parser.wordperfect.WP5DocumentAreaExtractor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aem-sdk-api Show documentation
Show all versions of aem-sdk-api Show documentation
The Adobe Experience Manager SDK
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.wordperfect;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.collections4.MapUtils;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
/**
* Extracts WordPerfect Document Area text from a WordPerfect document
* version 5.x.
* @author Pascal Essiembre
*/
class WP5DocumentAreaExtractor extends WPDocumentAreaExtractor {

    /*
     * Characters 192-207 introduce fixed-length multi-byte functions.
     * Those that are not handled explicitly in the code below are skipped
     * according to their size (minus the first char, which is already read).
     *
     * The array is a flat list of pairs: function code, total function size.
     * The map is typed <Integer, Integer> so that get(...) yields a value
     * that can be used in integer arithmetic.
     */
    private static final Map<Integer, Integer> FIXED_LENGTH_FUNCTION_SIZES =
            MapUtils.putAll(new HashMap<Integer, Integer>(), new Integer[] {
                192, 4,   // Extended character
                193, 9,   // Center/Align/ Tab/Left Margin Release
                194, 11,  // Indent
                195, 3,   // Attribute ON
                196, 3,   // Attribute OFF
                197, 5,   // Block Protect
                198, 6,   // End of Indent
                199, 7,   // Different Display Character when Hyphenated
                200, 4,   // (Reserved)
                201, 5,   // (Reserved)
                202, 6,   // (Reserved)
                203, 6,   // (Reserved)
                204, 8,   // (Reserved)
                205, 10,  // (Reserved)
                206, 10,  // (Reserved)
                207, 12,  // (Reserved)
            });

    /**
     * Handles a single character code of the document area, appending any
     * resulting text to {@code out} or ending the current paragraph.
     * Multi-byte functions consume their remaining bytes from {@code in}.
     * Codes not matched by any branch below are ignored.
     *
     * @param c     the character code that was just read
     * @param in    the stream from which the remaining bytes of a
     *              multi-byte function are read or skipped
     * @param out   buffer receiving the extracted text
     * @param xhtml handler passed to {@code endParagraph} on paragraph breaks
     * @throws IOException  if reading from {@code in} fails
     * @throws SAXException if ending a paragraph fails
     */
    protected void extract(int c, WPInputStream in, StringBuilder out,
            XHTMLContentHandler xhtml) throws IOException, SAXException {

        // 0-31: control characters
        if (c == 10) {
            endParagraph(out, xhtml); // hard return ("Enter")
        } else if (c == 11) {
            out.append(' ');          // soft page break
        } else if (c == 12) {
            endParagraph(out, xhtml); // hard page break
        } else if (c == 13) {
            out.append(' ');          // soft return (line wrap)

        // 32-126: ASCII characters
        } else if (c >= 32 && c <= 126) {
            out.append((char) c);     // ASCII character

        // 128-191: single-byte functions
        } else if (c == 140) {
            endParagraph(out, xhtml); // combination hard return/soft page (WP5.1)
        } else if (c >= 144 && c <= 149) {
            out.append(' ');          // deletable/invisible soft return/page
        } else if (c == 153) {
            endParagraph(out, xhtml); // Dormant Hard return (WP5.1)
        } else if (c == 160) {
            out.append('\u00A0');     // Hard space
        } else if (c >= 169 && c <= 171) {
            out.append('-');          // Hard hyphen

        // 192-207: fixed-length multi-byte function
        } else if (c == 192) {
            // extended char: value, then character set, then closing character
            int charval = in.readWP();
            int charset = in.readWP();
            in.readWP();              // closing character
            WP5Charsets.append(out, charset, charval);
        } else if (c >= 193 && c <= 207) {
            // removing 1 from function length since first char already read
            in.skipWPByte(FIXED_LENGTH_FUNCTION_SIZES.get(c) - 1);

        // 208-255: variable-length multi-byte function
        } else if (c >= 208 && c <= 255) {
            // Variable-Length Multi-Byte Functions
            in.readWP();              // subgroup (the function code)
            int functionSize = in.readWPShort();
            // The function content is read and discarded, one unit at a time.
            for (int i = 0; i < functionSize; i++) {
                in.readWP();
            }
            //TODO Are there functions containing data? Like footnotes?
        }
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy