All downloads are free. The search and download functionality uses the official Maven repository.

org.jruby.ext.nkf.RubyNKF Maven / Gradle / Ivy

/***** BEGIN LICENSE BLOCK *****
 * Version: EPL 2.0/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Eclipse Public
 * License Version 2.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.eclipse.org/legal/epl-v20.html
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * Copyright (C) 2007-2011 Koichiro Ohba 
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the EPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the EPL, the GPL or the LGPL.
 ***** END LICENSE BLOCK *****/

package org.jruby.ext.nkf;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.Map;
import java.util.HashMap;

import org.jcodings.Encoding;
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jcodings.transcode.EConv;
import org.jcodings.transcode.EConvFlags;
import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyModule;
import org.jruby.RubyString;

import org.jruby.anno.JRubyMethod;
import org.jruby.anno.JRubyModule;
import org.jruby.runtime.Helpers;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.util.ByteList;
import org.jruby.util.KCode;
import org.jruby.util.Pack;
import org.jruby.util.io.EncodingUtils;

@JRubyModule(name="NKF")
public class RubyNKF {
    /**
     * Character sets and transfer encodings known to the NKF module.
     * <p>
     * Every constant carries a numeric code and a charset name. The code is
     * not unique ({@code AUTO}/{@code UNKNOWN} share 0 and
     * {@code BINARY}/{@code NOCONV} share 4), and the charset name is
     * {@code null} for constants that have no charset of their own.
     */
    public static enum NKFCharset {
        AUTO(0, "x-JISAutoDetect"),
        // no ISO-2022-JP in jcodings
        JIS(1, "ISO-2022-JP"),
        EUC(2, "EUC-JP"),
        SJIS(3, "Shift_JIS"),
        BINARY(4, null),
        NOCONV(4, null),
        UNKNOWN(0, null),
        ASCII(5, "iso-8859-1"),
        UTF8(6, "UTF-8"),
        UTF16(8, "UTF-16"),
        UTF32(12, "UTF-32"),
        OTHER(16, null),
        BASE64(20, "base64"),
        QENCODE(21, "qencode"),
        MIME_DETECT(22, "MimeAutoDetect");

        /** Numeric NKF code of this constant. */
        private final int value;

        /** Charset name of this constant, or {@code null} when it has none. */
        private final String charset;

        private NKFCharset(int code, String charsetName) {
            value = code;
            charset = charsetName;
        }

        /** @return the numeric NKF code */
        public int getValue() {
            return this.value;
        }

        /** @return the charset name, possibly {@code null} */
        public String getCharset() {
            return this.charset;
        }
    }

    // Markers searched for when deciding whether a string contains MIME encoded text.
    private static final ByteList BEGIN_MIME_STRING = new ByteList(ByteList.plain("=?"));
    private static final ByteList END_MIME_STRING = new ByteList(ByteList.plain("?="));
    // Format strings handed to Pack.pack / Pack.unpack: "m" for Base64, "M" for Q-encoding.
    private static final ByteList PACK_BASE64 = new ByteList(ByteList.plain("m"));
    private static final ByteList PACK_QENCODE = new ByteList(ByteList.plain("M"));

    /**
     * Maps the numeric value of each {@link NKFCharset} to the constant's name.
     * Filled in by {@link #createNKF(Ruby)}; because some constants share a
     * numeric value, the entry for such a value is the last constant registered.
     */
    public static final Map<Integer, String> NKFCharsetMap = new HashMap<Integer, String>(20, 1);

    /**
     * Defines the {@code NKF} module on the given runtime.
     * <p>
     * Registers the version constants, records every {@link NKFCharset} in
     * {@link #NKFCharsetMap}, exposes the charsets whose numeric value is at
     * most 12 as module constants, and finally binds the annotated methods of
     * this class.
     *
     * @param runtime the runtime that receives the module
     */
    public static void createNKF(Ruby runtime) {
        final String version = "2.1.2";
        final String relDate = "2011-09-08";

        final RubyModule nkfModule = runtime.defineModule("NKF");
        nkfModule.defineConstant("NKF_VERSION", runtime.newString(version));
        nkfModule.defineConstant("NKF_RELEASE_DATE", runtime.newString(relDate));
        nkfModule.defineConstant("VERSION", runtime.newString(version + ' ' + '(' + "JRuby" + '_' + relDate + ')'));

        for (final NKFCharset entry : NKFCharset.values()) {
            final int code = entry.getValue();
            NKFCharsetMap.put(code, entry.name());

            // Codes above 12 are recorded in the map but not published as constants.
            if (code <= 12) {
                nkfModule.defineConstant(entry.name(), charsetMappedValue(runtime, entry));
            }
        }

        nkfModule.defineAnnotatedMethods(RubyNKF.class);
    }

    /**
     * Ruby-facing {@code NKF.guess}: detects the charset of {@code s} and
     * returns the value {@code charsetMappedValue} produces for it.
     */
    @JRubyMethod(name = "guess", required = 1, module = true)
    public static IRubyObject guess(ThreadContext context, IRubyObject recv, IRubyObject s) {
        final Ruby runtime = context.runtime;
        final NKFCharset detected = guess(context, s);
        return charsetMappedValue(runtime, detected);
    }

    /**
     * Detects the charset of the given string-like object.
     * <p>
     * The bytes are first run through the JDK's {@code x-JISAutoDetect}
     * decoder. If that decoder cannot decode the bytes, or detects a charset
     * that is not mapped here, the encoding already attached to the string is
     * consulted for UTF-8/UTF-16/UTF-32.
     *
     * @param context the current thread context
     * @param s       an object responding to {@code to_str}
     * @return the detected charset, or {@link NKFCharset#UNKNOWN}
     */
    public static NKFCharset guess(ThreadContext context, IRubyObject s) {
        // TODO: Fix charset usage for JRUBY-4553
        final Ruby runtime = context.runtime;
        if (!s.respondsTo("to_str")) {
            throw runtime.newTypeError("can't convert " + s.getMetaClass() + " into String");
        }

        final ByteList input = s.convertToString().getByteList();
        final ByteBuffer raw = ByteBuffer.wrap(input.getUnsafeBytes(), input.begin(), input.length());

        final CharsetDecoder detector;
        try {
            detector = Charset.forName("x-JISAutoDetect").newDecoder();
        } catch (UnsupportedCharsetException e) {
            throw runtime.newStandardError("charsets.jar is required to use NKF#guess. Please install JRE which supports m17n.");
        }

        try {
            detector.decode(raw);

            if (!detector.isCharsetDetected()) {
                return NKFCharset.UNKNOWN;
            }

            final String detected = detector.detectedCharset().name();
            if ("Shift_JIS".equals(detected)) {
                return NKFCharset.SJIS;
            } else if ("Windows-31j".equalsIgnoreCase(detected) || "ISO-2022-JP".equals(detected)) {
                // NOTE(review): Windows-31J is reported as JIS here, not SJIS — confirm this is intended.
                return NKFCharset.JIS;
            } else if ("EUC-JP".equals(detected)) {
                return NKFCharset.EUC;
            }
        } catch (CharacterCodingException e) {
            // fall through and try direct encoding
        }

        // Auto-detection gave no usable answer: rely on the string's own encoding.
        final Encoding declared = input.getEncoding();
        if (declared == UTF8Encoding.INSTANCE) {
            return NKFCharset.UTF8;
        }
        final String declaredName = declared.toString();
        if (declaredName.startsWith("UTF-16")) {
            return NKFCharset.UTF16;
        }
        return declaredName.startsWith("UTF-32") ? NKFCharset.UTF32 : NKFCharset.UNKNOWN;
    }

    /**
     * Converts an {@link NKFCharset} into the Ruby value exposed to callers:
     * {@code nil} for AUTO, NOCONV and UNKNOWN, the ASCII-8BIT encoding for
     * BINARY, and otherwise the encoding looked up by the charset's name.
     */
    private static IRubyObject charsetMappedValue(final Ruby runtime, final NKFCharset charset) {
        if (charset == NKFCharset.AUTO || charset == NKFCharset.NOCONV || charset == NKFCharset.UNKNOWN) {
            return runtime.getNil();
        }

        final Encoding resolved = (charset == NKFCharset.BINARY)
                ? runtime.getEncodingService().getAscii8bitEncoding()
                : runtime.getEncodingService().getEncodingFromString(charset.getCharset());

        return runtime.getEncodingService().convertEncodingToRubyEncoding(resolved);
    }

    /** Ruby-facing {@code NKF.guess1}; delegates unchanged to {@code guess}. */
    @JRubyMethod(name = "guess1", required = 1, module = true)
    public static IRubyObject guess1(ThreadContext context, IRubyObject recv, IRubyObject str) {
        final IRubyObject guessed = guess(context, recv, str);
        return guessed;
    }

    /** Ruby-facing {@code NKF.guess2}; delegates unchanged to {@code guess}. */
    @JRubyMethod(name = "guess2", required = 1, module = true)
    public static IRubyObject guess2(ThreadContext context, IRubyObject recv, IRubyObject str) {
        final IRubyObject guessed = guess(context, recv, str);
        return guessed;
    }

    /**
     * Ruby-facing {@code NKF.nkf(opt, str)}: converts {@code str} according
     * to the NKF option string {@code opt}.
     * <p>
     * When no input charset is requested the input is auto-detected with
     * {@link #guess(ThreadContext, IRubyObject)}. Text that looks like MIME
     * encoded text is handled by {@code MimeConverter}, everything else by
     * {@code DefaultConverter}. The converted result is finally Base64- or
     * Q-encoded if a MIME output encoding was requested.
     *
     * @param context the current thread context
     * @param recv    the receiver (unused)
     * @param opt     the option string; must respond to {@code to_str}
     * @param str     the text to convert; must respond to {@code to_str}
     * @return the converted string
     */
    @JRubyMethod(name = "nkf", required = 2, module = true)
    public static IRubyObject nkf(ThreadContext context, IRubyObject recv, IRubyObject opt, IRubyObject str) {
        Ruby runtime = context.runtime;

        if (!opt.respondsTo("to_str")) {
            throw runtime.newTypeError("can't convert " + opt.getMetaClass() + " into String");
        }

        if (!str.respondsTo("to_str")) {
            throw runtime.newTypeError("can't convert " + str.getMetaClass() + " into String");
        }

        // parseOpt only ever stores NKFCharset values under String keys, so the
        // typed view of its result is safe.
        @SuppressWarnings("unchecked")
        Map<String, NKFCharset> options = parseOpt(opt.convertToString().toString());

        if (options.get("input").getValue() == NKFCharset.AUTO.getValue()) {
            options.put("input", guess(context, str));
        }

        ByteList bstr = str.convertToString().getByteList();
        final Converter converter;
        if (Converter.isMimeText(bstr, options)) {
            converter = new MimeConverter(context, options);
        } else {
            converter = new DefaultConverter(context, options);
        }

        RubyString result = converter.convert(bstr);

        final NKFCharset mimeEncode = options.get("mime-encode");
        if (mimeEncode == NKFCharset.BASE64) {
            result = Converter.encodeMimeString(runtime, result, PACK_BASE64);
        } else if (mimeEncode == NKFCharset.QENCODE) {
            result = Converter.encodeMimeString(runtime, result, PACK_QENCODE);
        }

        return result;
    }

    /**
     * Registers every option this class recognises and parses the option
     * string {@code s} against that set.
     * <p>
     * NOTE(review): the meaning of the individual {@code addOption} overloads
     * (short name, long name, value pattern) is assumed from how they are
     * called here — confirm against {@code Options}.
     *
     * @param s the NKF option string
     * @return the parsed command returned by {@code CommandParser.parse}
     */
    public static Command parseOption(String s) {
        Options options = new Options();
        options.addOption("b");
        options.addOption("u");
        // output charset selectors (read by parseOpt)
        options.addOption("j", "jis");
        options.addOption("s", "sjis");
        options.addOption("e", "euc");
        options.addOption("w", null, "[0-9][0-9]");
        // input charset selectors (read by parseOpt)
        options.addOption("J", "jis-input");
        options.addOption("S", "sjis-input");
        options.addOption("E", "euc-input");
        options.addOption("W", null, "[0-9][0-9]");
        options.addOption("t");
        options.addOption("i_");
        options.addOption("o_");
        options.addOption("r");
        options.addOption("h1", "hiragana");
        options.addOption("h2", "katakana");
        options.addOption("h3", "katakana-hiragana");
        options.addOption("T");
        options.addOption("l");
        options.addOption("f", null, "[0-9]+-[0-9]*");
        options.addOption("F");
        options.addOption("Z", null, "[0-3]");
        options.addOption("X");
        options.addOption("x");
        options.addOption("B", null, "[0-2]");
        options.addOption("I");
        options.addOption("L", null, "[uwm]");
        options.addOption("d");
        options.addOption("c");
        // MIME decode (-m) and MIME encode (-M) selectors (read by parseOpt)
        options.addOption("m", null, "[BQN0]");
        options.addOption("M", null, "[BQ]");
        // options registered with a long name only
        options.addOption(null, "fj");
        options.addOption(null, "unix");
        options.addOption(null, "mac");
        options.addOption(null, "msdos");
        options.addOption(null, "windows");
        options.addOption(null, "mime");
        options.addOption(null, "base64");
        options.addOption(null, "mime-input");
        options.addOption(null, "base64-input");
        // --ic=<name> / --oc=<name>: input / output charset by name (read by parseOpt)
        options.addOption(null, "ic", "ic=(.*)");
        options.addOption(null, "oc", "oc=(.*)");
        options.addOption(null, "fb-skip");
        options.addOption(null, "fb-html");
        options.addOption(null, "fb-xml");
        options.addOption(null, "fb-perl");
        options.addOption(null, "fb-java");
        options.addOption(null, "fb-subchar", "fb-subchar=(.*)");
        options.addOption(null, "no-cp932ext");
        options.addOption(null, "cap-input");
        options.addOption(null, "url-input");
        options.addOption(null, "numchar-input");
        options.addOption(null, "no-best-fit-chars");

        CommandParser parser = new CommandParser();
        Command cmd = parser.parse(options, s);
        return cmd;
    }

    /**
     * Translates an NKF option string into the conversion settings used by
     * {@code nkf}.
     * <p>
     * The returned map always contains the keys {@code "input"},
     * {@code "output"}, {@code "mime-decode"} and {@code "mime-encode"}.
     * Defaults are auto-detected input, JIS output, auto-detected MIME
     * decoding and no MIME encoding. Options are applied in a fixed order, so
     * when several options set the same key the one checked last wins
     * ({@code --oc}/{@code --ic} override the single-letter flags).
     * Unrecognised or missing {@code --ic}/{@code --oc} names leave the
     * current setting untouched.
     *
     * @param s the NKF option string
     * @return a mutable map of conversion settings
     */
    private static Map<String, NKFCharset> parseOpt(String s) {
        final Map<String, NKFCharset> options = new HashMap<String, NKFCharset>();

        // default options
        options.put("input", NKFCharset.AUTO);
        options.put("output", NKFCharset.JIS);
        options.put("mime-decode", NKFCharset.MIME_DETECT);
        options.put("mime-encode", NKFCharset.NOCONV);

        final Command cmd = parseOption(s);

        // output charset
        if (cmd.hasOption("j")) {
            options.put("output", NKFCharset.JIS);
        }
        if (cmd.hasOption("s")) {
            options.put("output", NKFCharset.SJIS);
        }
        if (cmd.hasOption("e")) {
            options.put("output", NKFCharset.EUC);
        }
        if (cmd.hasOption("w")) {
            options.put("output", unicodeCharsetForWidth(cmd.getOption("w").getValue()));
        }

        // input charset
        if (cmd.hasOption("J")) {
            options.put("input", NKFCharset.JIS);
        }
        if (cmd.hasOption("S")) {
            options.put("input", NKFCharset.SJIS);
        }
        if (cmd.hasOption("E")) {
            options.put("input", NKFCharset.EUC);
        }
        if (cmd.hasOption("W")) {
            options.put("input", unicodeCharsetForWidth(cmd.getOption("W").getValue()));
        }

        // MIME decoding
        if (cmd.hasOption("m")) {
            final String mode = cmd.getOption("m").getValue();
            if (mode == null) {
                options.put("mime-decode", NKFCharset.MIME_DETECT);
            } else if ("B".equals(mode)) {
                options.put("mime-decode", NKFCharset.BASE64);
            } else if ("Q".equals(mode)) {
                options.put("mime-decode", NKFCharset.QENCODE);
            } else if ("0".equals(mode)) {
                options.put("mime-decode", NKFCharset.NOCONV);
            }
            // TODO: non-strict option ("N") is accepted but currently has no effect
        }

        // MIME encoding
        if (cmd.hasOption("M")) {
            final String mode = cmd.getOption("M").getValue();
            if (mode == null) {
                options.put("mime-encode", NKFCharset.NOCONV);
            } else if ("B".equals(mode)) {
                options.put("mime-encode", NKFCharset.BASE64);
            } else if ("Q".equals(mode)) {
                options.put("mime-encode", NKFCharset.QENCODE);
            }
        }
        if (cmd.hasOption("base64")) {
            options.put("mime-encode", NKFCharset.BASE64);
        }

        // charsets given by name; the same name table serves both directions
        if (cmd.hasOption("oc")) {
            final NKFCharset named = charsetForName(cmd.getOption("oc").getValue());
            if (named != null) {
                options.put("output", named);
            }
        }
        if (cmd.hasOption("ic")) {
            final NKFCharset named = charsetForName(cmd.getOption("ic").getValue());
            if (named != null) {
                options.put("input", named);
            }
        }

        return options;
    }

    /** Maps the numeric argument of {@code -w}/{@code -W} to UTF-32, UTF-16 or (anything else) UTF-8. */
    private static NKFCharset unicodeCharsetForWidth(String width) {
        if ("32".equals(width)) {
            return NKFCharset.UTF32;
        }
        if ("16".equals(width)) {
            return NKFCharset.UTF16;
        }
        return NKFCharset.UTF8;
    }

    /**
     * Resolves a charset name given to {@code --ic}/{@code --oc}, ignoring
     * case. Returns {@code null} for a missing or unrecognised name.
     */
    private static NKFCharset charsetForName(String name) {
        if ("ISO-2022-JP".equalsIgnoreCase(name)) {
            return NKFCharset.JIS;
        }
        if ("EUC-JP".equalsIgnoreCase(name)) {
            return NKFCharset.EUC;
        }
        // CP932 and Windows-31J are names for the same Shift_JIS variant.
        if ("CP932".equalsIgnoreCase(name)
                || "Shift_JIS".equalsIgnoreCase(name)
                || "Windows-31J".equalsIgnoreCase(name)) {
            return NKFCharset.SJIS;
        }
        if ("UTF-8".equalsIgnoreCase(name) || "UTF-8N".equalsIgnoreCase(name)) {
            return NKFCharset.UTF8;
        }
        if ("UTF-16".equalsIgnoreCase(name) || "UTF-16BE-BOM".equalsIgnoreCase(name)) {
            return NKFCharset.UTF16;
        }
        if ("UTF-32".equalsIgnoreCase(name) || "UTF-32BE-BOM".equalsIgnoreCase(name)) {
            return NKFCharset.UTF32;
        }
        return null;
    }

    /**
     * Base class of the string converters used by {@code nkf}. Holds the
     * thread context and the parsed conversion settings, and provides the
     * transcoding and MIME helpers shared by the concrete converters.
     */
    static abstract class Converter {

        protected ThreadContext context;
        /** Conversion settings keyed by "input", "output", "mime-decode" and "mime-encode". */
        protected Map<String, NKFCharset> options;

        public Converter(ThreadContext ctx, Map<String, NKFCharset> opt) {
            context = ctx;
            options = opt;
        }

        /**
         * Tells whether {@code str} should be treated as MIME encoded text:
         * it must be longer than six bytes, MIME decoding must not be
         * disabled, and both the "=?" and "?=" markers must occur in it.
         */
        static boolean isMimeText(ByteList str, Map<String, NKFCharset> options) {
            if (str.length() <= 6) {
                return false;
            }
            if (options.get("mime-decode") == NKFCharset.NOCONV) {
                return false;
            }
            if (str.indexOf(BEGIN_MIME_STRING) < 0) {
                return false;
            }
            if (str.lastIndexOf(END_MIME_STRING) < 0) {
                return false;
            }
            return true;
        }

        /** Packs {@code str} with the given pack format and chomps the result. */
        private static RubyString encodeMimeString(Ruby runtime, RubyString str, ByteList format) {
            RubyArray array = RubyArray.newArray(runtime, str);
            return Pack.pack(runtime, array, format).chomp(runtime.getCurrentContext());
        }

        /** Converts {@code str} according to this converter's settings. */
        abstract RubyString convert(ByteList str);

        /**
         * Transcodes {@code str} from {@code inputCharset} to the charset of
         * {@code output}.
         *
         * @param str          the bytes to transcode
         * @param inputCharset source charset name; when {@code null} the
         *                     encoding attached to {@code str} is used
         * @param output       target charset; must have a charset name
         * @return a copy of {@code str} when both charsets have the same
         *         name, otherwise the transcoded bytes tagged with the
         *         target encoding
         */
        ByteList convert_byte(ByteList str, String inputCharset, NKFCharset output) {
            String outputCharset = output.getCharset();

            if (inputCharset == null) {
                inputCharset = str.getEncoding().toString();
            }

            if (outputCharset.equals(inputCharset)) {
                return str.dup();
            }

            byte[] outCharsetBytes = outputCharset.getBytes();

            EConv ec = EncodingUtils.econvOpenOpts(context, inputCharset.getBytes(), outCharsetBytes, 0, context.nil);

            if (ec == null) {
                throw context.runtime.newArgumentError("invalid encoding pair: " + inputCharset + " to " + outputCharset);
            }

            ByteList converted = EncodingUtils.econvStrConvert(context, ec, str, EConvFlags.INVALID_REPLACE);

            converted.setEncoding(context.runtime.getEncodingService().findEncodingOrAliasEntry(outCharsetBytes).getEncoding());

            return converted;
        }
    }

    static class DefaultConverter extends Converter {

        public DefaultConverter(ThreadContext ctx, Map opt) {
            super(ctx, opt);
        }

        RubyString convert(ByteList str) {
            NKFCharset input = options.get("input");
            NKFCharset output = options.get("output");
            ByteList b = convert_byte(str,
                    input.getCharset(),
                    output);
            return context.runtime.newString(b);
        }
    }

    /**
     * Converter for text containing MIME encoded-words
     * ({@code =?charset?B|Q?payload?=}). The input is split on whitespace;
     * each encoded-word is decoded and transcoded to the output charset, and
     * the pieces are concatenated. Tokens that are not encoded-words are
     * passed through as plain text instead of aborting the conversion. As
     * before, the whitespace separating tokens is not reproduced in the
     * output.
     */
    static class MimeConverter extends Converter {

        /** Charset used to turn Java strings into bytes, independent of the platform default. */
        private static final Charset UTF_8 = Charset.forName("UTF-8");

        public MimeConverter(ThreadContext ctx, Map<String, NKFCharset> opt) {
            super(ctx, opt);
        }

        /**
         * Maps the charset label of an encoded-word to one of the charset
         * names this class can transcode from; unknown labels are treated as
         * iso-8859-1.
         */
        private String detectCharset(String charset) {
            if (charset.equalsIgnoreCase(NKFCharset.UTF8.getCharset())) {
                return NKFCharset.UTF8.getCharset();
            } else if (charset.equalsIgnoreCase(NKFCharset.JIS.getCharset())) {
                return NKFCharset.JIS.getCharset();
            } else if (charset.equalsIgnoreCase(NKFCharset.EUC.getCharset())) {
                return NKFCharset.EUC.getCharset();
            } else if (charset.equalsIgnoreCase(NKFCharset.SJIS.getCharset())) {
                return NKFCharset.SJIS.getCharset();
            } else {
                return NKFCharset.ASCII.getCharset();
            }
        }

        /**
         * Decodes a single whitespace-free token. A token that does not split
         * into charset, encoding and payload is not an encoded-word and is
         * transcoded as plain text.
         */
        private ByteList decodeMimeString(String str) {
            // Explicit cast: the inherited settings map may be declared as a raw Map.
            final NKFCharset output = (NKFCharset) options.get("output");

            // Yields ["", charset, encoding, payload, ...] for an encoded-word.
            String[] mime = str.split("^=\\?|\\?|\\?=$");
            if (mime.length < 4 || mime[2].length() == 0) {
                ByteList plain = new ByteList(str.getBytes(UTF_8), UTF8Encoding.INSTANCE);
                return convert_byte(plain, NKFCharset.UTF8.getCharset(), output);
            }

            String charset = detectCharset(mime[1]);
            int encode = mime[2].charAt(0);
            // The payload of an encoded-word is ASCII, so its UTF-8 bytes are its ASCII bytes.
            ByteList body = new ByteList(mime[3].getBytes(UTF_8), ASCIIEncoding.INSTANCE);

            final RubyArray array;
            if ('B' == encode || 'b' == encode) { // BASE64
                array = Pack.unpack(context.runtime, body, PACK_BASE64);
            } else { // Qencode
                array = Pack.unpack(context.runtime, body, PACK_QENCODE);
            }
            RubyString s = (RubyString) array.entry(0);
            ByteList decodeStr = s.asString().getByteList();

            return convert_byte(decodeStr, charset, output);
        }

        /** Concatenates the given byte lists into one new Ruby string. */
        RubyString makeRubyString(ArrayList<ByteList> list) {
            ByteList r = new ByteList();
            for (ByteList l : list) {
                r.append(l);
            }
            return context.runtime.newString(r);
        }

        @Override
        RubyString convert(ByteList str) {
            String s = Helpers.decodeByteList(context.runtime, str);
            ArrayList<ByteList> rawData = new ArrayList<ByteList>();

            for (String token : s.split("\\s")) {
                if (token.length() == 0) {
                    continue; // consecutive whitespace produces empty tokens
                }
                rawData.add(decodeMimeString(token));
            }

            return makeRubyString(rawData);
        }

    }

    // Deprecated aliases: each constant below simply re-exports the
    // NKFCharset constant of the same name.

    /** @deprecated Use {@link NKFCharset#AUTO} instead. */
    @Deprecated
    public static final NKFCharset AUTO = NKFCharset.AUTO;
    // no ISO-2022-JP in jcodings
    /** @deprecated Use {@link NKFCharset#JIS} instead. */
    @Deprecated
    public static final NKFCharset JIS = NKFCharset.JIS;
    /** @deprecated Use {@link NKFCharset#EUC} instead. */
    @Deprecated
    public static final NKFCharset EUC = NKFCharset.EUC;
    /** @deprecated Use {@link NKFCharset#SJIS} instead. */
    @Deprecated
    public static final NKFCharset SJIS = NKFCharset.SJIS;
    /** @deprecated Use {@link NKFCharset#BINARY} instead. */
    @Deprecated
    public static final NKFCharset BINARY = NKFCharset.BINARY;
    /** @deprecated Use {@link NKFCharset#NOCONV} instead. */
    @Deprecated
    public static final NKFCharset NOCONV = NKFCharset.NOCONV;
    /** @deprecated Use {@link NKFCharset#UNKNOWN} instead. */
    @Deprecated
    public static final NKFCharset UNKNOWN = NKFCharset.UNKNOWN;
    /** @deprecated Use {@link NKFCharset#ASCII} instead. */
    @Deprecated
    public static final NKFCharset ASCII = NKFCharset.ASCII;
    /** @deprecated Use {@link NKFCharset#UTF8} instead. */
    @Deprecated
    public static final NKFCharset UTF8 = NKFCharset.UTF8;
    /** @deprecated Use {@link NKFCharset#UTF16} instead. */
    @Deprecated
    public static final NKFCharset UTF16 = NKFCharset.UTF16;
    /** @deprecated Use {@link NKFCharset#UTF32} instead. */
    @Deprecated
    public static final NKFCharset UTF32 = NKFCharset.UTF32;
    /** @deprecated Use {@link NKFCharset#OTHER} instead. */
    @Deprecated
    public static final NKFCharset OTHER = NKFCharset.OTHER;
    /** @deprecated Use {@link NKFCharset#BASE64} instead. */
    @Deprecated
    public static final NKFCharset BASE64 = NKFCharset.BASE64;
    /** @deprecated Use {@link NKFCharset#QENCODE} instead. */
    @Deprecated
    public static final NKFCharset QENCODE = NKFCharset.QENCODE;
    /** @deprecated Use {@link NKFCharset#MIME_DETECT} instead. */
    @Deprecated
    public static final NKFCharset MIME_DETECT = NKFCharset.MIME_DETECT;
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy