All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.scudata.expression.fn.CharDetect Maven / Gradle / Ivy

Go to download

SPL(Structured Process Language) A programming language specially for structured data computing.

There is a newer version: 20240823
Show newest version
package com.scudata.expression.fn;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.scudata.common.*;
import com.scudata.dm.Env;
import com.scudata.dm.FileObject;
import com.scudata.dm.Sequence;
import org.mozilla.universalchardet.UniversalDetector;

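/***************************************
 * 
 * chardetect@v(p)
 * Parameter p: with option v, a string (or byte array) value to inspect;
 * otherwise a URL or a file name.
 * Return value: the detected encoding name as a String, or a sequence of
 * names when option a is used together with option v.
 * 
 * */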

/**
 * Detects the character encoding of a value, a URL or a file.
 *
 * <p>The first argument is the thing to inspect. With option {@code v} it is
 * treated as a value (a {@code String} or a {@code byte[]}); without it a
 * {@code String} is treated as an http(s) URL or a file name, and a
 * {@link FileObject} is resolved through its file name. An optional second
 * argument, when it is a {@link Sequence}, lists the encodings the caller is
 * willing to accept; detected encodings outside that list are discarded.
 *
 * <p>Any exception raised during detection is logged and {@code null} is
 * returned.
 */
public class CharDetect extends CharFunction {
	/** Accepted encoding names from the second argument, stored upper-cased. Reset on every call. */
	List<String> m_codes = new ArrayList<String>();

	/**
	 * Entry point of the function.
	 *
	 * @param objs {@code objs[0]} is the value/URL/file to inspect; {@code objs[1]}
	 *             may be a {@link Sequence} of acceptable encoding names
	 * @return the detected encoding name, a {@link Sequence} of names (options
	 *         {@code v} and {@code a} together), or {@code null} when nothing
	 *         acceptable was detected or an error occurred
	 */
	protected Object doQuery(Object[] objs) {
		try {
			m_codes.clear();
			if (objs == null || objs.length < 1) {
				throw new Exception("chardet paramSize error!");
			}

			// Optional second argument: the encodings the caller accepts.
			if (objs.length >= 2 && objs[1] instanceof Sequence) {
				Sequence seq = (Sequence) objs[1];
				for (int i = 1; i <= seq.length(); i++) {
					Object code = seq.get(i);
					if (code != null) { // null members cannot name an encoding
						m_codes.add(code.toString().toUpperCase(Locale.ROOT));
					}
				}
			}

			Object source = objs[0];
			if (option != null && option.contains("v")) {
				// check encoding for a string / byte[] value
				return detectValue(source, option.contains("a"));
			}
			if (source instanceof String) {
				String sTmp = source.toString();
				String reg = "^(https?)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]";
				if (isMatch(sTmp, reg)) { // for url
					return detectEncoding(new URL(sTmp));
				}
				// for file
				return detectCharsetFile(sTmp);
			}
			if (source instanceof FileObject) {
				FileObject fo = (FileObject) source;
				return detectCharsetFile(fo.getFileName());
			}
		} catch (Exception e) {
			Logger.error(e.getMessage());
		}

		return null;
	}

	/**
	 * Detects the encoding of an in-memory value (option {@code v}).
	 *
	 * @param value     a {@code String} or {@code byte[]}; other types fall through
	 *                  to the {@code CharEncodingDetectEx} fallback with a null buffer
	 * @param returnAll {@code true} (option {@code a}) to return every candidate as
	 *                  a {@link Sequence} instead of only the first one
	 * @return the first candidate, a sequence of candidates, or {@code null} when
	 *         no candidate survives the {@code m_codes} filter
	 * @throws Exception whatever the underlying detectors throw
	 */
	private Object detectValue(Object value, boolean returnAll) throws Exception {
		byte[] buf = null;
		List<String> result = null;
		CharEncodingDetect detector = new CharEncodingDetect();
		if (value instanceof String) {
			// NOTE(review): uses the platform default charset to obtain bytes — confirm this is intended.
			buf = value.toString().getBytes();
			result = detector.autoDetectEncoding(buf);
		} else if (value instanceof byte[]) {
			buf = (byte[]) value;
			result = detector.autoDetectEncoding(buf);
		}

		if (result == null) {
			// Primary detector gave no answer: fall back to the secondary one.
			String encoding = CharEncodingDetectEx.getJavaEncode(buf);
			result = new ArrayList<String>();
			result.add(encoding);
		}

		if (m_codes.size() > 0) {
			// Keep only the candidates the caller listed (case-insensitive).
			List<String> rep = new ArrayList<String>();
			for (String item : result) {
				if (item != null && m_codes.contains(item.toUpperCase(Locale.ROOT))) {
					rep.add(item);
				}
			}
			if (rep.isEmpty()) {
				return null;
			}
			result = rep;
		}

		if (returnAll) {
			return new Sequence(result.toArray(new String[result.size()]));
		}
		return result.isEmpty() ? null : result.get(0);
	}

	/**
	 * Locates {@code sfile} and detects its encoding. The name is tried as given,
	 * then under the configured main path, then under the {@code main} and
	 * {@code demo} directories of the {@code start.home} system property.
	 *
	 * @param sfile file name, absolute or relative
	 * @return the detected encoding, or {@code null} if the file was not found or
	 *         no acceptable encoding was detected
	 * @throws IOException if reading the file fails
	 */
	private Object detectCharsetFile(String sfile) throws IOException
	{
		File file = new File(sfile);
		if (file.exists()) {
			return getFileCharset(file);
		}

		// 1. main path configured by the user
		String path = Env.getMainPath();
		if (path != null) {
			file = new File(path + File.separatorChar + sfile);
			if (file.exists()) {
				return getFileCharset(file);
			}
		}

		path = System.getProperty("start.home");
		if (path != null) { // avoid probing "null/main/..." when the property is unset
			// 2. main directory shipped with the system
			file = new File(path + File.separatorChar + "main" + File.separatorChar + sfile);
			if (file.exists()) {
				return getFileCharset(file);
			}
			// 3. demo directory shipped with the system
			file = new File(path + File.separatorChar + "demo" + File.separatorChar + sfile);
			if (file.exists()) {
				return getFileCharset(file);
			}
		}

		Logger.info("File: "+ sfile +" not existed.");
		return null;
	}

	/**
	 * Feeds {@code stream} to {@link UniversalDetector} and returns the detected
	 * encoding, falling back to {@code CharEncodingDetectEx.getJavaEncode(obj)}
	 * when nothing is detected or the detected name is not listed in
	 * {@code EncodingEx.javaname}.
	 *
	 * @param stream data to analyse; not closed by this method
	 * @param obj    the origin of the stream (a {@link File} or {@link URL}), handed
	 *               to the fallback detector
	 * @return the encoding name, or {@code null} if none was found or it is not
	 *         among the accepted encodings in {@code m_codes}
	 * @throws IOException if reading the stream fails
	 */
	private String detectEncoding(InputStream stream, Object obj) throws IOException {
		UniversalDetector detector = new UniversalDetector(null);

		int read;
		byte[] buffer = new byte[4096];
		while ((read = stream.read(buffer)) > 0 && !detector.isDone()) {
			detector.handleData(buffer, 0, read);
		}
		detector.dataEnd();

		String encoding = detector.getDetectedCharset();
		detector.reset();

		if (encoding == null) {
			encoding = CharEncodingDetectEx.getJavaEncode(obj);
		} else {
			// NOTE(review): this instance is not used directly, but constructing it may
			// initialise the static EncodingEx.javaname table — confirm before removing.
			BytesEncodingDetect s = new BytesEncodingDetect();
			if (!Arrays.asList(EncodingEx.javaname).contains(encoding)) {
				encoding = CharEncodingDetectEx.getJavaEncode(obj);
			}
		}

		if (m_codes.size() > 0) {
			// m_codes holds upper-cased names, so fold the detected name the same way;
			// otherwise mixed-case names such as "Shift_JIS" could never match.
			if (encoding == null || !m_codes.contains(encoding.toUpperCase(Locale.ROOT))) {
				return null;
			}
		}
		return encoding;
	}

	/**
	 * Opens {@code url} and detects the encoding of its content.
	 *
	 * @param url resource to read
	 * @return the detected encoding, or {@code null} if it could not be determined
	 *         or the resource could not be read (the error is logged)
	 */
	private String detectEncoding(URL url) {
		try {
			InputStream istream = url.openStream();
			try {
				return detectEncoding(istream, url);
			} finally {
				// Always release the connection, even when detection fails.
				istream.close();
			}
		} catch (Exception e) {
			Logger.error("Error loading or using URL " + e.toString());
		}

		return null;
	}

	/**
	 * Tests whether the whole of {@code strUrl} matches {@code regExp}.
	 *
	 * @param strUrl text to test; must be non-empty
	 * @param regExp regular expression; must be non-empty
	 * @return {@code true} if the entire text matches
	 * @throws RQException if either argument is null or empty
	 */
	private boolean isMatch(String strUrl, String regExp)
	{
		if (strUrl == null || strUrl.isEmpty()) {
			throw new RQException("spark isMatch strUrl is empty");
		}

		if (regExp == null || regExp.isEmpty()) {
			throw new RQException("spark isMatch regExp is empty");
		}

		Pattern p = Pattern.compile(regExp);
		Matcher m = p.matcher(strUrl);

		return m.matches();
	}

	/**
	 * Detects the encoding of an existing file.
	 *
	 * @param file file to read
	 * @return the detected encoding, or {@code null} if none was found or accepted
	 * @throws IOException if the file cannot be opened or read
	 */
	private String getFileCharset(File file) throws IOException {
		FileInputStream ins = new FileInputStream(file);
		try {
			return detectEncoding(ins, file);
		} finally {
			// Close the handle even if detection throws.
			ins.close();
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy