All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.wordperfect.WP5Charsets Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.wordperfect;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * WordPerfect 5.x constant values used for mapping WordPerfect charsets to
 * Unicode equivalents when possible.
 * <p>
 * This is a non-instantiable holder for the lookup tables plus a single
 * helper, {@link #append(StringBuilder, int, int)}, that performs a
 * bounds-checked lookup.
 *
 * @author Pascal Essiembre
 */
final class WP5Charsets {
    /**
     * Extended character sets used when fixed-length multi-byte functions
     * with a byte value of 192 (0xC0) are found in a WordPerfect document.
     * Those character set codes may be specific to WordPerfect
     * file specifications and may or may not be considered standard
     * outside WordPerfect. Applies to version 5.x.
     * <p>
     * The first index is the WordPerfect charset number, the second index is
     * the character value within that charset. Several charsets reuse the
     * corresponding {@link WP6Charsets#EXTENDED_CHARSETS} table by reference.
     */
    public static final char[][] EXTENDED_CHARSETS = new char[][]{
            // WP Charset 0: ASCII (same as WP6)
            WP6Charsets.EXTENDED_CHARSETS[0],
            // WP Charset 1: Multinational 1 (same as WP6)
            WP6Charsets.EXTENDED_CHARSETS[1],
            // WP Charset 2: Multinational 2 (28 chars)
            {'\u0323', '\u0324', '\u02da', '\u0325', '\u02bc', '\u032d', '\u2017', '\u005f',
                    '\u0138', '\u032e', '\u033e', '\u2018', '\u0020', '\u02bd', '\u02db', '\u0327',
                    '\u0321', '\u0322', '\u030d', '\u2019', '\u0329', '\u0020', '\u0621', '\u02be',
                    '\u0306', '\u0310', '\u2032', '\u2034'},
            // WP Charset 3: Box Drawing (same as WP6)
            WP6Charsets.EXTENDED_CHARSETS[3],
            // WP Charset 4: Typographic Symbols (same as WP6)
            WP6Charsets.EXTENDED_CHARSETS[4],
            // WP Charset 5: Iconic Symbol (35 chars)
            {'\u2665', '\u2666', '\u2663', '\u2660', '\u2642', '\u2640', '\u263c', '\u263a',
                    '\u263b', '\u266a', '\u266c', '\u25ac', '\u2302', '\u203c', '\u221a', '\u21a8',
                    '\u2310', '\u2319', '\u25d8', '\u25d9', '\u21b5', '\u261e', '\u261c', '\u2713',
                    '\u2610', '\u2612', '\u2639', '\u266f', '\u266d', '\u266e', '\u260e', '\u231a',
                    '\u231b', '\u2104', '\u23b5'},
            // WP Charset 6: Math/Scientific (same as WP6)
            WP6Charsets.EXTENDED_CHARSETS[6],
            // WP Charset 7 Math/Scientific Extended (same as WP6)
            WP6Charsets.EXTENDED_CHARSETS[7],
            // WP Charset 8: Greek (210 chars)
            {'\u0391', '\u03b1', '\u0392', '\u03b2', '\u0392', '\u03d0', '\u0393', '\u03b3',
                    '\u0394', '\u03b4', '\u0395', '\u03b5', '\u0396', '\u03b6', '\u0397', '\u03b7',
                    '\u0398', '\u03b8', '\u0399', '\u03b9', '\u039a', '\u03ba', '\u039b', '\u03bb',
                    '\u039c', '\u03bc', '\u039d', '\u03bd', '\u039e', '\u03be', '\u039f', '\u03bf',
                    '\u03a0', '\u03c0', '\u03a1', '\u03c1', '\u03a3', '\u03c3', '\u03f9', '\u03db',
                    '\u03a4', '\u03c4', '\u03a5', '\u03c5', '\u03a6', '\u03d5', '\u03a7', '\u03c7',
                    '\u03a8', '\u03c8', '\u03a9', '\u03c9', '\u03ac', '\u03ad', '\u03ae', '\u03af',
                    '\u03ca', '\u03cc', '\u03cd', '\u03cb', '\u03ce', '\u03b5', '\u03d1', '\u03f0',
                    '\u03d6', '\u1fe5', '\u03d2', '\u03c6', '\u03c9', '\u037e', '\u0387', '\u0384',
                    '\u00a8', '\u0385', '\u1fed', '\u1fef', '\u1fc0', '\u1fbd', '\u1fbf', '\u1fbe',
                    '\u1fce', '\u1fde', '\u1fcd', '\u1fdd', '\u1fcf', '\u1fdf', '\u0384', '\u1fef',
                    '\u1fc0', '\u1fbd', '\u1fbf', '\u1fce', '\u1fde', '\u1fcd', '\u1fdd', '\u1fcf',
                    '\u1fdf', '\u1f70', '\u1fb6', '\u1fb3', '\u1fb4', '\u1fb7', '\u1f00', '\u1f04',
                    '\u1f02', '\u1f06', '\u1f80', '\u1f84', '\u1f86', '\u1f01', '\u1f05', '\u1f03',
                    '\u1f07', '\u1f81', '\u1f85', '\u1f87', '\u1f72', '\u1f10', '\u1f14', '\u1f13',
                    '\u1f11', '\u1f15', '\u1f13', '\u1f74', '\u1fc6', '\u1fc3', '\u1fc4', '\u1fc2',
                    '\u1fc7', '\u1f20', '\u1f24', '\u1f22', '\u1f26', '\u1f90', '\u1f94', '\u1f96',
                    '\u1f21', '\u1f25', '\u1f23', '\u1f27', '\u1f91', '\u1f95', '\u1f97', '\u1f76',
                    '\u1fd6', '\u0390', '\u1fd2', '\u1f30', '\u1f34', '\u1f32', '\u1f36', '\u1f31',
                    '\u1f35', '\u1f33', '\u1f37', '\u1f78', '\u1f40', '\u1f44', '\u1f42', '\u1f41',
                    '\u1f45', '\u1f43', '\u1f7a', '\u1fe6', '\u03b0', '\u1fe3', '\u1f50', '\u1f54',
                    '\u1f52', '\u1f56', '\u1f51', '\u1f55', '\u1f53', '\u1f57', '\u1f7c', '\u1ff6',
                    '\u1ff3', '\u1ff4', '\u1ff2', '\u1ff7', '\u1f60', '\u1f64', '\u1f62', '\u1f66',
                    '\u1fa0', '\u1fa4', '\u1fa6', '\u1f61', '\u1f65', '\u1f63', '\u1f67', '\u1fa1',
                    '\u1fa5', '\u1fa7', '\u0374', '\u0375', '\u03db', '\u03dd', '\u03d9', '\u03e1',
                    '\u0386', '\u0388', '\u0389', '\u038a', '\u038c', '\u038e', '\u038f', '\u03aa',
                    '\u03ab', '\u1fe5'},
            // WP Charset 9: Hebrew (119 chars)
            // NOTE(review): a few entries lie outside the Hebrew Unicode blocks
            // (e.g. U+09AA is Bengali, U+0544 is Armenian) -- possibly typos;
            // verify against the WordPerfect 5.x character tables.
            {'\u05d0', '\u05d1', '\u05d2', '\u05d3', '\u05d4', '\u05d5', '\u05d6', '\u05d7',
                    '\u05d8', '\u05d9', '\u05da', '\u05db', '\u05dc', '\u05dd', '\u05de', '\u05df',
                    '\u05e0', '\u05e1', '\u05e2', '\u05e3', '\u05e4', '\u05e5', '\u05e6', '\u05e7',
                    '\u05e8', '\u05e9', '\u05ea', '\u05be', '\u05c0', '\u05c3', '\u05f3', '\u05f4',
                    '\u05b0', '\u05b1', '\u05b2', '\u05b3', '\u05b4', '\u05b5', '\u05b6', '\u05b7',
                    '\u05b8', '\u05b9', '\u05ba', '\u05bb', '\u05bc', '\u05bd', '\u05bf', '\u05b7',
                    '\ufbe1', '\u05f0', '\u05f1', '\u05f2', '\u0591', '\u0596', '\u05ad', '\u05a4',
                    '\u059a', '\u059b', '\u05a3', '\u05a5', '\u05a6', '\u05a7', '\u09aa', '\u0592',
                    '\u0593', '\u0594', '\u0595', '\u0597', '\u0598', '\u0599', '\u05a8', '\u059c',
                    '\u059d', '\u059e', '\u05a1', '\u05a9', '\u05a0', '\u059f', '\u05ab', '\u05ac',
                    '\u05af', '\u05c4', '\u0544', '\u05d0', '\ufb31', '\ufb32', '\ufb33', '\ufb34',
                    '\ufb35', '\ufb4b', '\ufb36', '\u05d7', '\ufb38', '\ufb39', '\ufb3b', '\ufb3a',
                    '\u05da', '\u05da', '\u05da', '\u05da', '\u05da', '\u05da', '\ufb3c', '\ufb3e',
                    '\ufb40', '\u05df', '\ufb41', '\ufb44', '\ufb46', '\ufb47', '\ufb2b', '\ufb2d',
                    '\ufb2a', '\ufb2c', '\ufb4a', '\ufb4c', '\ufb4e', '\ufb1f', '\ufb1d'},
            // WP Charset 10: Cyrillic (150 chars)
            // The U+0000 entries are appended as-is by append(); presumably they
            // mark positions without a Unicode mapping -- TODO confirm.
            {'\u0410', '\u0430', '\u0411', '\u0431', '\u0412', '\u0432', '\u0413', '\u0433',
                    '\u0414', '\u0434', '\u0415', '\u0435', '\u0401', '\u0451', '\u0416', '\u0436',
                    '\u0417', '\u0437', '\u0418', '\u0438', '\u0419', '\u0439', '\u041a', '\u043a',
                    '\u041b', '\u043b', '\u041c', '\u043c', '\u041d', '\u043d', '\u041e', '\u043e',
                    '\u041f', '\u043f', '\u0420', '\u0440', '\u0421', '\u0441', '\u0422', '\u0442',
                    '\u0423', '\u0443', '\u0424', '\u0444', '\u0425', '\u0445', '\u0426', '\u0446',
                    '\u0427', '\u0447', '\u0428', '\u0448', '\u0429', '\u0449', '\u042a', '\u044a',
                    '\u042b', '\u044b', '\u042c', '\u044c', '\u042d', '\u044d', '\u042e', '\u044e',
                    '\u042f', '\u044f', '\u0490', '\u0491', '\u0402', '\u0452', '\u0403', '\u0453',
                    '\u0404', '\u0454', '\u0405', '\u0455', '\u0406', '\u0456', '\u0407', '\u0457',
                    '\u0408', '\u0458', '\u0409', '\u0459', '\u040a', '\u045a', '\u040b', '\u045b',
                    '\u040c', '\u045c', '\u040e', '\u045e', '\u040f', '\u045f', '\u0462', '\u0463',
                    '\u0472', '\u0473', '\u0474', '\u0475', '\u046a', '\u046b', '\ua640', '\ua641',
                    '\u0429', '\u0449', '\u04c0', '\u04cf', '\u0466', '\u0467', '\u0000', '\u0000',
                    '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000',
                    '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000',
                    '\u0000', '\u0000', '\u0400', '\u0450', '\u0000', '\u0000', '\u040d', '\u045d',
                    '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000', '\u0000',
                    '\u0000', '\u0000', '\u0000', '\u0000', '\u0301', '\u0300'},
            // WP Charset 11: Japanese (185 chars)
            // NOTE(review): U+03D0 (a Greek code point) appears among the katakana
            // entries -- possibly a typo; verify against the WordPerfect 5.x
            // character tables.
            {'\u3041', '\u3043', '\u3045', '\u3047', '\u3049', '\u3053', '\u3083', '\u3085',
                    '\u3087', '\u3094', '\u3095', '\u3096', '\u3042', '\u3044', '\u3046', '\u3048',
                    '\u304a', '\u304b', '\u304d', '\u3047', '\u3051', '\u3053', '\u304c', '\u304e',
                    '\u3050', '\u3052', '\u3054', '\u3055', '\u3057', '\u3059', '\u305b', '\u305d',
                    '\u3056', '\u3058', '\u305a', '\u305c', '\u305e', '\u305f', '\u3051', '\u3064',
                    '\u3066', '\u3068', '\u3060', '\u3062', '\u3065', '\u3067', '\u3069', '\u306a',
                    '\u306b', '\u306c', '\u306d', '\u306e', '\u306f', '\u3072', '\u3075', '\u3078',
                    '\u307b', '\u3070', '\u3073', '\u3076', '\u3079', '\u307c', '\u3071', '\u3074',
                    '\u3077', '\u307a', '\u307d', '\u307e', '\u307f', '\u3080', '\u3081', '\u3082',
                    '\u3084', '\u3086', '\u3088', '\u3089', '\u308a', '\u308b', '\u308c', '\u308d',
                    '\u308e', '\u3092', '\u3093', '\u3014', '\u3015', '\uff3b', '\uff3d', '\u300c',
                    '\u300d', '\u300c', '\u300d', '\u302a', '\u3002', '\u3001', '\u309d', '\u309e',
                    '\u3003', '\u30fc', '\u309b', '\u309c', '\u30a1', '\u30a3', '\u30a5', '\u30a7',
                    '\u30a9', '\u30c3', '\u30e3', '\u30e5', '\u3057', '\u30f4', '\u30f5', '\u30f6',
                    '\u30a2', '\u30a4', '\u30a6', '\u30a8', '\u30aa', '\u30ab', '\u30ad', '\u30af',
                    '\u30b1', '\u30b3', '\u30ac', '\u30ae', '\u30b0', '\u30b2', '\u30b4', '\u30b5',
                    '\u30c4', '\u30b9', '\u30bb', '\u30bd', '\u30b6', '\u30b8', '\u30ba', '\u30bc',
                    '\u30be', '\u30bf', '\u30c1', '\u30c4', '\u30c6', '\u30c8', '\u30c0', '\u30c2',
                    '\u30c5', '\u30c7', '\u30c9', '\u30ca', '\u30cb', '\u30cc', '\u30cd', '\u30ce',
                    '\u30cf', '\u30d2', '\u30d5', '\u30d8', '\u03d0', '\u30db', '\u30d3', '\u30d6',
                    '\u30d9', '\u30dc', '\u30d1', '\u30d4', '\u30d7', '\u30da', '\u30dd', '\u30de',
                    '\u30df', '\u30e0', '\u30e1', '\u30e2', '\u30e4', '\u30e6', '\u30e8', '\u30e9',
                    '\u30ea', '\u30ab', '\u30ec', '\u30ed', '\u30ef', '\u30f2', '\u30f3', '\u30fd',
                    '\u30fe'},
            // WP Charset 12: User-defined (255 chars)
            // Every user-defined character is mapped to a plain space.
            {' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
                    ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '}};
    private static final Logger LOG = LoggerFactory.getLogger(WP5Charsets.class);

    //TODO map multi-characters

    /**
     * Private constructor: this class only exposes static members and is not
     * meant to be instantiated.
     */
    private WP5Charsets() {
    }

    /**
     * Appends the Unicode equivalent of a WordPerfect 5.x extended character
     * to the given builder.
     * <p>
     * If {@code charset} does not identify a table in
     * {@link #EXTENDED_CHARSETS}, or {@code charval} does not identify an
     * entry within the selected table (this includes negative values for
     * either argument), a debug message is logged and a single space is
     * appended instead.
     *
     * @param out     builder receiving the mapped character (or a space)
     * @param charset index of the charset table in {@link #EXTENDED_CHARSETS}
     * @param charval index of the character within the selected charset table
     */
    public static void append(StringBuilder out, int charset, int charval) {
        // Negative indexes are rejected too: without the lower-bound check they
        // would reach the array access and throw ArrayIndexOutOfBoundsException
        // instead of taking the space fallback.
        if (charset < 0 || charset >= EXTENDED_CHARSETS.length) {
            LOG.debug("Unsupported WordPerfect 5.x charset: {}", charset);
            out.append(' ');
            return;
        }

        final char[] table = EXTENDED_CHARSETS[charset];
        if (charval < 0 || charval >= table.length) {
            LOG.debug("Unsupported WordPerfect 5.x charset ({}) character value: {}", charset,
                    charval);
            out.append(' ');
            return;
        }

        out.append(table[charval]);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy