All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.tika.parser.wordperfect.WP5Charsets Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.wordperfect;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * WordPerfect 5.x constant values used for mapping WordPerfect charsets to
 * Unicode equivalents when possible.
 * <p>
 * This is a non-instantiable holder for the {@link #EXTENDED_CHARSETS} lookup
 * table and the {@link #append(StringBuilder, int, int)} helper that reads it.
 * @author Pascal Essiembre
 */
final class WP5Charsets {
    private static final Logger LOG = LoggerFactory.getLogger(WP5Charsets.class);

    /**
     * Extended character sets used when fixed-length multi-byte functions
     * with a byte value of 192 (0xC0) are found in a WordPerfect document.
     * Those character set codes may be specific to WordPerfect
     * file specifications and may or may not be considered standard
     * outside WordPerfect. Applies to version 5.x.
     * <p>
     * The first index is the WordPerfect charset number and the second index
     * is the character value within that charset. Charsets 0, 1, 3, 4, 6 and 7
     * reuse the very same array instances as {@code WP6Charsets}.
     */
    public static final char[][] EXTENDED_CHARSETS = new char[][] {
        // WP Charset 0: ASCII (same as WP6)
        WP6Charsets.EXTENDED_CHARSETS[0],
        // WP Charset 1: Multinational 1 (same as WP6)
        WP6Charsets.EXTENDED_CHARSETS[1],
        // WP Charset 2: Multinational 2 (28 chars)
        // NOTE(review): '\u0621' (third row) lies in the Arabic block while its
        // neighbours are modifier/combining marks -- confirm against the
        // WordPerfect 5.x charset specification.
        {
        '\u0323','\u0324','\u02da','\u0325','\u02bc','\u032d','\u2017','\u005f',
        '\u0138','\u032e','\u033e','\u2018','\u0020','\u02bd','\u02db','\u0327',
        '\u0321','\u0322','\u030d','\u2019','\u0329','\u0020','\u0621','\u02be',
        '\u0306','\u0310','\u2032','\u2034'
        },
        // WP Charset 3: Box Drawing (same as WP6)
        WP6Charsets.EXTENDED_CHARSETS[3],
        // WP Charset 4: Typographic Symbols (same as WP6)
        WP6Charsets.EXTENDED_CHARSETS[4],
        // WP Charset 5: Iconic Symbol (35 chars)
        {
        '\u2665','\u2666','\u2663','\u2660','\u2642','\u2640','\u263c','\u263a',
        '\u263b','\u266a','\u266c','\u25ac','\u2302','\u203c','\u221a','\u21a8',
        '\u2310','\u2319','\u25d8','\u25d9','\u21b5','\u261e','\u261c','\u2713',
        '\u2610','\u2612','\u2639','\u266f','\u266d','\u266e','\u260e','\u231a',
        '\u231b','\u2104','\u23b5'
        },
        // WP Charset 6: Math/Scientific (same as WP6)
        WP6Charsets.EXTENDED_CHARSETS[6],
        // WP Charset 7 Math/Scientific Extended (same as WP6)
        WP6Charsets.EXTENDED_CHARSETS[7],
        // WP Charset 8: Greek (210 chars)
        {
        '\u0391','\u03b1','\u0392','\u03b2','\u0392','\u03d0','\u0393','\u03b3',
        '\u0394','\u03b4','\u0395','\u03b5','\u0396','\u03b6','\u0397','\u03b7',
        '\u0398','\u03b8','\u0399','\u03b9','\u039a','\u03ba','\u039b','\u03bb',
        '\u039c','\u03bc','\u039d','\u03bd','\u039e','\u03be','\u039f','\u03bf',
        '\u03a0','\u03c0','\u03a1','\u03c1','\u03a3','\u03c3','\u03f9','\u03db',
        '\u03a4','\u03c4','\u03a5','\u03c5','\u03a6','\u03d5','\u03a7','\u03c7',
        '\u03a8','\u03c8','\u03a9','\u03c9','\u03ac','\u03ad','\u03ae','\u03af',
        '\u03ca','\u03cc','\u03cd','\u03cb','\u03ce','\u03b5','\u03d1','\u03f0',
        '\u03d6','\u1fe5','\u03d2','\u03c6','\u03c9','\u037e','\u0387','\u0384',
        '\u00a8','\u0385','\u1fed','\u1fef','\u1fc0','\u1fbd','\u1fbf','\u1fbe',
        '\u1fce','\u1fde','\u1fcd','\u1fdd','\u1fcf','\u1fdf','\u0384','\u1fef',
        '\u1fc0','\u1fbd','\u1fbf','\u1fce','\u1fde','\u1fcd','\u1fdd','\u1fcf',
        '\u1fdf','\u1f70','\u1fb6','\u1fb3','\u1fb4','\u1fb7','\u1f00','\u1f04',
        '\u1f02','\u1f06','\u1f80','\u1f84','\u1f86','\u1f01','\u1f05','\u1f03',
        '\u1f07','\u1f81','\u1f85','\u1f87','\u1f72','\u1f10','\u1f14','\u1f13',
        '\u1f11','\u1f15','\u1f13','\u1f74','\u1fc6','\u1fc3','\u1fc4','\u1fc2',
        '\u1fc7','\u1f20','\u1f24','\u1f22','\u1f26','\u1f90','\u1f94','\u1f96',
        '\u1f21','\u1f25','\u1f23','\u1f27','\u1f91','\u1f95','\u1f97','\u1f76',
        '\u1fd6','\u0390','\u1fd2','\u1f30','\u1f34','\u1f32','\u1f36','\u1f31',
        '\u1f35','\u1f33','\u1f37','\u1f78','\u1f40','\u1f44','\u1f42','\u1f41',
        '\u1f45','\u1f43','\u1f7a','\u1fe6','\u03b0','\u1fe3','\u1f50','\u1f54',
        '\u1f52','\u1f56','\u1f51','\u1f55','\u1f53','\u1f57','\u1f7c','\u1ff6',
        '\u1ff3','\u1ff4','\u1ff2','\u1ff7','\u1f60','\u1f64','\u1f62','\u1f66',
        '\u1fa0','\u1fa4','\u1fa6','\u1f61','\u1f65','\u1f63','\u1f67','\u1fa1',
        '\u1fa5','\u1fa7','\u0374','\u0375','\u03db','\u03dd','\u03d9','\u03e1',
        '\u0386','\u0388','\u0389','\u038a','\u038c','\u038e','\u038f','\u03aa',
        '\u03ab','\u1fe5'
        },
        // WP Charset 9: Hebrew (119 chars)
        // NOTE(review): '\ufbe1' (Arabic Presentation Forms-A range), '\u09aa'
        // (Bengali block) and '\u0544' (Armenian block) fall outside the Hebrew
        // blocks used by every other entry; they may be typos (e.g. for
        // '\u05aa' / '\u05c4'-adjacent values) -- confirm against the
        // WordPerfect 5.x charset specification before changing them.
        {
        '\u05d0','\u05d1','\u05d2','\u05d3','\u05d4','\u05d5','\u05d6','\u05d7',
        '\u05d8','\u05d9','\u05da','\u05db','\u05dc','\u05dd','\u05de','\u05df',
        '\u05e0','\u05e1','\u05e2','\u05e3','\u05e4','\u05e5','\u05e6','\u05e7',
        '\u05e8','\u05e9','\u05ea','\u05be','\u05c0','\u05c3','\u05f3','\u05f4',
        '\u05b0','\u05b1','\u05b2','\u05b3','\u05b4','\u05b5','\u05b6','\u05b7',
        '\u05b8','\u05b9','\u05ba','\u05bb','\u05bc','\u05bd','\u05bf','\u05b7',
        '\ufbe1','\u05f0','\u05f1','\u05f2','\u0591','\u0596','\u05ad','\u05a4',
        '\u059a','\u059b','\u05a3','\u05a5','\u05a6','\u05a7','\u09aa','\u0592',
        '\u0593','\u0594','\u0595','\u0597','\u0598','\u0599','\u05a8','\u059c',
        '\u059d','\u059e','\u05a1','\u05a9','\u05a0','\u059f','\u05ab','\u05ac',
        '\u05af','\u05c4','\u0544','\u05d0','\ufb31','\ufb32','\ufb33','\ufb34',
        '\ufb35','\ufb4b','\ufb36','\u05d7','\ufb38','\ufb39','\ufb3b','\ufb3a',
        '\u05da','\u05da','\u05da','\u05da','\u05da','\u05da','\ufb3c','\ufb3e',
        '\ufb40','\u05df','\ufb41','\ufb44','\ufb46','\ufb47','\ufb2b','\ufb2d',
        '\ufb2a','\ufb2c','\ufb4a','\ufb4c','\ufb4e','\ufb1f','\ufb1d'
        },
        // WP Charset 10: Cyrillic (150 chars)
        // NOTE(review): the '\u0000' entries look like placeholders for
        // positions without a mapping; append() emits them verbatim as NUL
        // characters -- confirm that this is intended.
        {
        '\u0410','\u0430','\u0411','\u0431','\u0412','\u0432','\u0413','\u0433',
        '\u0414','\u0434','\u0415','\u0435','\u0401','\u0451','\u0416','\u0436',
        '\u0417','\u0437','\u0418','\u0438','\u0419','\u0439','\u041a','\u043a',
        '\u041b','\u043b','\u041c','\u043c','\u041d','\u043d','\u041e','\u043e',
        '\u041f','\u043f','\u0420','\u0440','\u0421','\u0441','\u0422','\u0442',
        '\u0423','\u0443','\u0424','\u0444','\u0425','\u0445','\u0426','\u0446',
        '\u0427','\u0447','\u0428','\u0448','\u0429','\u0449','\u042a','\u044a',
        '\u042b','\u044b','\u042c','\u044c','\u042d','\u044d','\u042e','\u044e',
        '\u042f','\u044f','\u0490','\u0491','\u0402','\u0452','\u0403','\u0453',
        '\u0404','\u0454','\u0405','\u0455','\u0406','\u0456','\u0407','\u0457',
        '\u0408','\u0458','\u0409','\u0459','\u040a','\u045a','\u040b','\u045b',
        '\u040c','\u045c','\u040e','\u045e','\u040f','\u045f','\u0462','\u0463',
        '\u0472','\u0473','\u0474','\u0475','\u046a','\u046b','\ua640','\ua641',
        '\u0429','\u0449','\u04c0','\u04cf','\u0466','\u0467','\u0000','\u0000',
        '\u0000','\u0000','\u0000','\u0000','\u0000','\u0000','\u0000','\u0000',
        '\u0000','\u0000','\u0000','\u0000','\u0000','\u0000','\u0000','\u0000',
        '\u0000','\u0000','\u0400','\u0450','\u0000','\u0000','\u040d','\u045d',
        '\u0000','\u0000','\u0000','\u0000','\u0000','\u0000','\u0000','\u0000',
        '\u0000','\u0000','\u0000','\u0000','\u0301','\u0300'
        },
        // WP Charset 11: Japanese (185 chars)
        // NOTE(review): several entries break the otherwise regular kana
        // sequence, and '\u03d0' lies in the Greek block rather than a kana
        // block -- verify against the WordPerfect 5.x charset specification
        // before relying on (or correcting) these values.
        {
        '\u3041','\u3043','\u3045','\u3047','\u3049','\u3053','\u3083','\u3085',
        '\u3087','\u3094','\u3095','\u3096','\u3042','\u3044','\u3046','\u3048',
        '\u304a','\u304b','\u304d','\u3047','\u3051','\u3053','\u304c','\u304e',
        '\u3050','\u3052','\u3054','\u3055','\u3057','\u3059','\u305b','\u305d',
        '\u3056','\u3058','\u305a','\u305c','\u305e','\u305f','\u3051','\u3064',
        '\u3066','\u3068','\u3060','\u3062','\u3065','\u3067','\u3069','\u306a',
        '\u306b','\u306c','\u306d','\u306e','\u306f','\u3072','\u3075','\u3078',
        '\u307b','\u3070','\u3073','\u3076','\u3079','\u307c','\u3071','\u3074',
        '\u3077','\u307a','\u307d','\u307e','\u307f','\u3080','\u3081','\u3082',
        '\u3084','\u3086','\u3088','\u3089','\u308a','\u308b','\u308c','\u308d',
        '\u308e','\u3092','\u3093','\u3014','\u3015','\uff3b','\uff3d','\u300c',
        '\u300d','\u300c','\u300d','\u302a','\u3002','\u3001','\u309d','\u309e',
        '\u3003','\u30fc','\u309b','\u309c','\u30a1','\u30a3','\u30a5','\u30a7',
        '\u30a9','\u30c3','\u30e3','\u30e5','\u3057','\u30f4','\u30f5','\u30f6',
        '\u30a2','\u30a4','\u30a6','\u30a8','\u30aa','\u30ab','\u30ad','\u30af',
        '\u30b1','\u30b3','\u30ac','\u30ae','\u30b0','\u30b2','\u30b4','\u30b5',
        '\u30c4','\u30b9','\u30bb','\u30bd','\u30b6','\u30b8','\u30ba','\u30bc',
        '\u30be','\u30bf','\u30c1','\u30c4','\u30c6','\u30c8','\u30c0','\u30c2',
        '\u30c5','\u30c7','\u30c9','\u30ca','\u30cb','\u30cc','\u30cd','\u30ce',
        '\u30cf','\u30d2','\u30d5','\u30d8','\u03d0','\u30db','\u30d3','\u30d6',
        '\u30d9','\u30dc','\u30d1','\u30d4','\u30d7','\u30da','\u30dd','\u30de',
        '\u30df','\u30e0','\u30e1','\u30e2','\u30e4','\u30e6','\u30e8','\u30e9',
        '\u30ea','\u30ab','\u30ec','\u30ed','\u30ef','\u30f2','\u30f3','\u30fd',
        '\u30fe'
        },
        // WP Charset 12: User-defined (255 chars)
        // Every position maps to a plain space.
        {
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',
        ' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' ',' '
        }
    };

    //TODO map multi-characters

    /**
     * Private constructor: this class only exposes static members and is not
     * meant to be instantiated.
     */
    private WP5Charsets() {
    }

    /**
     * Appends to {@code out} the character mapped to the given WordPerfect 5.x
     * charset and character value in {@link #EXTENDED_CHARSETS}.
     * <p>
     * When either index falls outside the table (negative, or past the end of
     * the corresponding array) a debug message is logged and a single space is
     * appended instead, so exactly one character is always appended.
     *
     * @param out     builder receiving the mapped character
     * @param charset WordPerfect charset number (first table index)
     * @param charval character value within the charset (second table index)
     */
    public static void append(StringBuilder out, int charset, int charval) {
        // Negative values are rejected as well: indexing the table with them
        // would throw ArrayIndexOutOfBoundsException instead of degrading to
        // the "unsupported" fallback used for too-large values.
        if (charset < 0 || charset >= EXTENDED_CHARSETS.length) {
            LOG.debug("Unsupported WordPerfect 5.x charset: {}", charset);
            out.append(' ');
            return;
        }

        final char[] mapping = EXTENDED_CHARSETS[charset];
        if (charval < 0 || charval >= mapping.length) {
            LOG.debug("Unsupported WordPerfect 5.x charset ({}) character value: {}", charset, charval);
            out.append(' ');
            return;
        }

        out.append(mapping[charval]);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy