All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.jruby.util.encoding.Transcoder Maven / Gradle / Ivy

/*
 **** BEGIN LICENSE BLOCK *****
 * Version: EPL 1.0/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Eclipse Public
 * License Version 1.0 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.eclipse.org/legal/epl-v10.html
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * Copyright (C) 2013 The JRuby Community (jruby.org)
 * 
 * Alternatively, the contents of this file may be used under the terms of
 * either of the GNU General Public License Version 2 or later (the "GPL"),
 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the EPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the EPL, the GPL or the LGPL.
 ***** END LICENSE BLOCK *****/
package org.jruby.util.encoding;

import java.nio.charset.Charset;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.jcodings.Encoding;
import org.jcodings.EncodingDB;
import org.jcodings.specific.ASCIIEncoding;
import org.jruby.Ruby;
import org.jruby.RubyString;
import org.jruby.exceptions.RaiseException;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.jruby.util.ByteList;
import org.jruby.util.StringSupport;
import org.jruby.util.io.EncodingUtils;

/**
 * Abstract superclass for all transcoders.
 * 
 * This and its implementations are roughly equivalent to rb_econv_t in MRI.
 * 
 * Besides the abstract conversion operations, this class hosts the static
 * entry points ({@link #open}, {@link #strConvEncOpts}, {@link #transcode})
 * that build a {@link CharsetTranscoder} and the logic that turns the most
 * recent {@link RubyCoderResult} into a Ruby exception.
 */
public abstract class Transcoder {
    /** Runtime taken from the {@link ThreadContext} given to the constructor. */
    protected final Ruby runtime;
    /** Output (destination) encoding supplied at construction. */
    public final Encoding outEncoding;
    /** Input (source) encoding supplied at construction. */
    public final Encoding inEncoding;
    /**
     * Result inspected by {@link #getLastResult()} and {@link #getLastError()}.
     * It is never assigned in this class; presumably implementations store the
     * outcome of their latest conversion here.
     */
    public RubyCoderResult lastResult;
    /** Exception most recently built by {@link #createLastError()}, or null. */
    private RaiseException lastError;
    
    /**
     * @param context current thread context; only its runtime is retained
     * @param outEncoding encoding to convert to
     * @param inEncoding encoding to convert from
     */
    public Transcoder(ThreadContext context, Encoding outEncoding, Encoding inEncoding) {
        this.runtime = context.runtime;
        this.outEncoding = outEncoding;
        this.inEncoding = inEncoding;
    }
    
    /**
     * Opens a transcoder for the given encoding names. Currently a thin
     * wrapper around {@link #open0}; the decorator handling of MRI is still
     * to be done (see TODOs).
     * 
     * c: rb_econv_open
     * 
     * @param context current thread context
     * @param sourceEncoding name of the source encoding; may be empty
     * @param destinationEncoding name of the destination encoding; may be empty
     * @param ecflags conversion flags, handed through to the transcoder
     * @param replacement replacement object, handed through to the transcoder
     * @return the transcoder, or null if {@link #open0} could not create one
     */
    public static Transcoder open(ThreadContext context, byte[] sourceEncoding, byte[] destinationEncoding, int ecflags, IRubyObject replacement) {
        // TODO: decorator finish logic
        
        // TODO: lighter-weight pass for non-transcoding with decorators (NullTranscoder)
        
        // TODO: set error handler mask for decorator logic, i.e. pass
        //       ecflags & EncodingUtils.ECONV_ERROR_HANDLER_MASK to open0
        Transcoder transcoder = open0(context, sourceEncoding, destinationEncoding, ecflags, replacement);
        
        if (transcoder == null) return null;
        
        // TODO: decorator finish logic
        
        // TODO: clear error handler mask
        
        return transcoder;
    }
    
    /**
     * Resolves both encoding names and creates a {@link CharsetTranscoder}.
     * 
     * When both names are empty no transcoding is requested and both sides
     * are forced to binary (ASCII-8BIT). Otherwise a transcoding charset must
     * be available for each side, or null is returned.
     * 
     * @param context current thread context
     * @param sourceEncoding name of the source encoding; may be empty
     * @param destinationEncoding name of the destination encoding; may be empty
     * @param ecflags conversion flags, handed through to the transcoder
     * @param replacement replacement object, handed through to the transcoder
     * @return a new transcoder, or null when either side has no transcoding charset
     */
    public static Transcoder open0(ThreadContext context, byte[] sourceEncoding, byte[] destinationEncoding, int ecflags, IRubyObject replacement) {
        Encoding senc = lookupEncoding(context, sourceEncoding);
        Encoding denc = lookupEncoding(context, destinationEncoding);
        
        if (sourceEncoding.length == 0 && destinationEncoding.length == 0) {
            // no transcoding; for our purposes, we force both to be binary
            senc = denc = ASCIIEncoding.INSTANCE;
        } else if (CharsetTranscoder.transcodeCharsetFor(context.runtime, sourceEncoding, senc, false) == null ||
                CharsetTranscoder.transcodeCharsetFor(context.runtime, destinationEncoding, denc, false) == null) {
            // inefficient; doing Charset lookup here *and* in the transcoder
            return null;
        }
        
        return new CharsetTranscoder(context,
                denc,
                senc,
                ecflags,
                replacement);
    }
    
    /**
     * Looks up an encoding by name or alias through the runtime's encoding service.
     * 
     * @param context current thread context
     * @param name encoding name bytes; wrapped without copying
     * @return the matching encoding, or null if the name is empty or unknown
     */
    private static Encoding lookupEncoding(ThreadContext context, byte[] name) {
        if (name.length == 0) return null;
        
        EncodingDB.Entry entry = context.runtime.getEncodingService().findEncodingOrAliasEntry(new ByteList(name, false));
        
        return entry == null ? null : entry.getEncoding();
    }
    
    /**
     * This will try and transcode the supplied ByteList to the supplied toEncoding.  It will use
     * fromEncoding as the source encoding if it is supplied; otherwise it will use the encoding
     * it has tucked away in the bytelist.
     * 
     * The original value is returned untouched when toEncoding is null, when source and target
     * encodings are the same object, when no transcoder could be opened, or when the conversion
     * does not finish cleanly. If the target is ASCII-8BIT, or the target is ASCII-compatible and
     * the value scans as 7-bit, no bytes are converted; the value is merely re-tagged (on a
     * shallow duplicate) with toEncoding.
     * 
     * c: rb_str_conv_enc_opts
     * 
     * @param context current thread context
     * @param value bytes to convert
     * @param fromEncoding source encoding, or null to use the value's own encoding
     * @param toEncoding target encoding, or null for no conversion
     * @param ecflags conversion flags passed to EncodingUtils.econvOpenOpts
     * @param ecopts conversion options passed to EncodingUtils.econvOpenOpts
     * @return the converted bytes tagged with toEncoding, or value as described above
     */
    public static ByteList strConvEncOpts(ThreadContext context, ByteList value, Encoding fromEncoding,
            Encoding toEncoding, int ecflags, IRubyObject ecopts) {
        if (toEncoding == null) return value;
        if (fromEncoding == null) fromEncoding = value.getEncoding();
        if (fromEncoding == toEncoding) return value;
        
        // This logic appears to not work like in MRI; following code will not
        // properly decode the string:
        // "\x00a".force_encoding("ASCII-8BIT").encode("UTF-8", "UTF-16BE")
        if ((toEncoding.isAsciiCompatible() && StringSupport.codeRangeScan(value.getEncoding(), value) == StringSupport.CR_7BIT) ||
                toEncoding == ASCIIEncoding.INSTANCE) {
            if (value.getEncoding() != toEncoding) {
                value = value.shallowDup();
                value.setEncoding(toEncoding);
            }
            return value;
        }
        
        Transcoder ec = EncodingUtils.econvOpenOpts(context, fromEncoding.getName(), toEncoding.getName(), ecflags, ecopts);
        if (ec == null) return value;

        ByteList newStr = new ByteList();
        RubyCoderResult ret = ec.econvConvert(context, value, newStr);

        if (ret == null || ret.stringResult.equals("finished")) {
            newStr.setEncoding(toEncoding);
            return newStr;
        }
        
        // error result, failover to original
        return value;
    }
    
    /**
     * Same as {@link #strConvEncOpts} with no flags and nil options.
     * 
     * c: rb_str_conv_enc
     */
    public static ByteList strConvEnc(ThreadContext context, ByteList value, Encoding fromEncoding, Encoding toEncoding) {
        return strConvEncOpts(context, value, fromEncoding, toEncoding, 0, context.nil);
    }
    
    /**
     * Transcodes value using a freshly created {@link CharsetTranscoder} whose
     * coding error actions are derived from opts.
     * 
     * @param context current thread context
     * @param value bytes to convert
     * @param fromEncoding source encoding, or null to use the value's own encoding
     * @param toEncoding target encoding, or null for no conversion
     * @param opts options handed to CharsetTranscoder.processCodingErrorActions
     * @param is7BitASCII passed through to the transcoder unchanged
     * @return value itself if toEncoding is null or identical to the source
     *         encoding, otherwise whatever the transcoder returns
     */
    public static ByteList transcode(ThreadContext context, ByteList value, Encoding fromEncoding,
            Encoding toEncoding, IRubyObject opts, boolean is7BitASCII) {
        if (toEncoding == null) return value;
        if (fromEncoding == null) fromEncoding = value.getEncoding();
        if (fromEncoding == toEncoding) return value;
        
        // The "7-bit or binary target: just re-tag the bytes" shortcut that
        // strConvEncOpts applies is deliberately NOT taken here. It appears to
        // not work like in MRI; with it the following would not be decoded
        // properly:
        // "\x00a".force_encoding("ASCII-8BIT").encode("UTF-8", "UTF-16BE")
        
        return new CharsetTranscoder(context, toEncoding, fromEncoding, CharsetTranscoder.processCodingErrorActions(context, opts)).transcode(context, value, is7BitASCII);
    }
    
    // rb_econv_convert
    public abstract RubyCoderResult transcode(ThreadContext context, ByteList value, ByteList dest);

    /** Converts value into dest; used by {@link #strConvEncOpts}, which treats a null or "finished" result as success. */
    public abstract RubyCoderResult econvConvert(ThreadContext context, ByteList value, ByteList dest);
    
    public abstract ByteList transcode(ThreadContext context, ByteList value);
    
    public abstract ByteList transcode(ThreadContext context, ByteList value, boolean is7BitASCII);
    
    // from Converter#convert
    public abstract ByteList convert(ThreadContext context, ByteList value, boolean is7BitASCII);
    
    public abstract ByteList econvStrConvert(ThreadContext context, ByteList value, boolean finish);
    
    public abstract RubyCoderResult primitiveConvert(ThreadContext context, ByteList inBuffer, ByteList outBuffer, int outOffset, int outLimit, Encoding inEncoding, boolean is7BitASCII, int flags);
    
    public abstract ByteList finish(Encoding altEncoding);
    
    /**
     * @return the current {@link #lastResult}, possibly null
     */
    public RubyCoderResult getLastResult() {
        return lastResult;
    }
    
    /**
     * Builds the Ruby exception that corresponds to the current
     * {@link #lastResult}.
     * 
     * @return a new InvalidByteSequenceError or UndefinedConversionError, or
     *         null if there is no result or it is not such an error
     */
    public RaiseException getLastError() {
        createLastError();
        
        return lastError;
    }

    /**
     * Recomputes {@link #lastError} from {@link #lastResult}. The attached
     * exception object gets the result wrapped as its data struct.
     */
    private void createLastError() {
        // Start clean so an exception built for an earlier failure is never
        // reported for a later result that is not an error.
        lastError = null;
        
        if (lastResult == null || !lastResult.isError()) return;
        
        RubyString errorBytes = runtime.newString(new ByteList(lastResult.errorBytes, ASCIIEncoding.INSTANCE, true));
        errorBytes.setEncoding(ASCIIEncoding.INSTANCE);

        // handle error
        if (lastResult.isInvalid()) {
            // FIXME: gross error message construction
            lastError = runtime.newInvalidByteSequenceError("\"" + errorBytes.inspect19().toString() + "\" on " + lastResult.inEncoding);
            lastError.getException().dataWrapStruct(lastResult);
        } else if (lastResult.isUndefined()) {
            // FIXME: gross error message construction
            lastError = runtime.newUndefinedConversionError("\"" + errorBytes.inspect19().toString() + "\" from " + lastResult.inEncoding + " to " + lastResult.outEncoding);
            lastError.getException().dataWrapStruct(lastResult);
        }
    }
    
    /**
     * Unmodifiable set holding the JDK charsets for UTF-8, UTF-16, UTF-16BE,
     * UTF-16LE, UTF-32, UTF-32BE and UTF-32LE.
     */
    public static final Set<Charset> UNICODE_CHARSETS;
    static {
        Set<Charset> charsets = new HashSet<Charset>();
        
        charsets.add(Charset.forName("UTF-8"));
        charsets.add(Charset.forName("UTF-16"));
        charsets.add(Charset.forName("UTF-16BE"));
        charsets.add(Charset.forName("UTF-16LE"));
        charsets.add(Charset.forName("UTF-32"));
        charsets.add(Charset.forName("UTF-32BE"));
        charsets.add(Charset.forName("UTF-32LE"));
        
        UNICODE_CHARSETS = Collections.unmodifiableSet(charsets);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy