com.scudata.expression.fn.CharDetect Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of esproc Show documentation
Show all versions of esproc Show documentation
SPL(Structured Process Language) A programming language specially for structured data computing.
package com.scudata.expression.fn;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.scudata.common.*;
import com.scudata.dm.Env;
import com.scudata.dm.FileObject;
import com.scudata.dm.Sequence;
import org.mozilla.universalchardet.UniversalDetector;
/**
 * Implements the {@code chardetect} function, which reports the character
 * encoding of a value or of a resource.
 *
 * <p>Usage as visible in this class:
 * <ul>
 *   <li>{@code chardetect@v(p)}: {@code p} is the content itself, either a
 *       {@code String} or a {@code byte[]}. Adding option {@code a} returns
 *       every detected candidate as a {@link Sequence} instead of only the
 *       first one.</li>
 *   <li>{@code chardetect(p)}: {@code p} is a {@code String} holding an
 *       http/https URL or a file name, or a {@link FileObject}.</li>
 *   <li>An optional second argument that is a {@link Sequence} of encoding
 *       names restricts the answer to those names (compared
 *       case-insensitively).</li>
 * </ul>
 *
 * <p>The result is a {@code String} for a single value and a {@link Sequence}
 * for multiple values; {@code null} is returned when nothing matches or when
 * detection fails (failures are logged, not thrown).
 */
public class CharDetect extends CharFunction {
    /** Shape of an http/https URL; any other string argument is treated as a file name. */
    private static final String URL_REGEX =
            "^(https?)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]";

    /** Upper-cased encoding names the caller accepts; empty means "no restriction". */
    List<String> m_codes = new ArrayList<String>();

    /**
     * Entry point of the function.
     *
     * @param objs {@code objs[0]} is the value or resource to inspect;
     *             {@code objs[1]}, when it is a {@link Sequence}, lists the
     *             acceptable encoding names
     * @return the detected encoding name, a {@link Sequence} of names when
     *         options {@code v} and {@code a} are both set, or {@code null}
     *         when nothing was detected, nothing passed the filter, or an
     *         error occurred (the error is logged)
     */
    protected Object doQuery(Object[] objs) {
        try {
            m_codes.clear();
            if (objs == null || objs.length < 1) {
                throw new Exception("chardet paramSize error!");
            }

            if (objs.length >= 2 && objs[1] instanceof Sequence) {
                Sequence seq = (Sequence) objs[1];
                // Sequence members are addressed from 1 here, as in the original loop.
                for (int i = 1; i <= seq.length(); i++) {
                    Object code = seq.get(i);
                    // A null member cannot name an encoding; skip it instead of failing the call.
                    if (code != null) {
                        m_codes.add(upper(code.toString()));
                    }
                }
            }

            Object source = objs[0];
            if (option != null && option.contains("v")) {
                // Option v: the argument is the content to analyse.
                return detectFromValue(source);
            }
            if (source instanceof String) {
                String text = (String) source;
                if (isMatch(text, URL_REGEX)) { // for url
                    return detectEncoding(new URL(text));
                }
                return detectCharsetFile(text); // for file
            }
            if (source instanceof FileObject) {
                return detectCharsetFile(((FileObject) source).getFileName());
            }
        } catch (Exception e) {
            Logger.error(e.getMessage());
        }
        return null;
    }

    /**
     * Detects the encoding of in-memory content (option {@code v}).
     *
     * @param value a {@code String} or {@code byte[]}; for any other type no
     *              bytes are extracted and only the fallback detector is asked
     * @return the first accepted encoding name, all accepted names as a
     *         {@link Sequence} when option {@code a} is set, or {@code null}
     */
    private Object detectFromValue(Object value) {
        byte[] buf = null;
        List<String> result = null;
        CharEncodingDetect detector = new CharEncodingDetect();
        if (value instanceof String) {
            // NOTE(review): getBytes() encodes with the platform default charset,
            // so the bytes analysed depend on the JVM configuration - confirm intent.
            buf = ((String) value).getBytes();
            result = detector.autoDetectEncoding(buf);
        } else if (value instanceof byte[]) {
            buf = (byte[]) value;
            result = detector.autoDetectEncoding(buf);
        }
        if (result == null) {
            // The primary detector gave no answer: fall back to the secondary one.
            String encoding = CharEncodingDetectEx.getJavaEncode(buf);
            result = new ArrayList<String>();
            result.add(encoding);
        }

        boolean filtering = !m_codes.isEmpty();
        List<String> selected = result;
        if (filtering) {
            selected = new ArrayList<String>();
            for (String item : result) {
                if (item != null && m_codes.contains(upper(item))) {
                    selected.add(item);
                }
            }
            if (selected.isEmpty()) {
                return null;
            }
        }

        if (option != null && option.contains("a")) {
            return new Sequence(selected.toArray(new String[selected.size()]));
        }
        // Guard against an empty candidate list instead of relying on an exception.
        return selected.isEmpty() ? null : selected.get(0);
    }

    /**
     * Locates a file by name and detects its encoding. The name is tried as
     * given, then under the configured main path, then under the
     * {@code main} and {@code demo} directories of the {@code start.home}
     * system property.
     *
     * @param sfile file name, absolute or relative
     * @return the detected encoding, or {@code null} when the file is not
     *         found in any location or no accepted encoding was detected
     * @throws IOException if the file cannot be read
     */
    private Object detectCharsetFile(String sfile) throws IOException {
        File file = new File(sfile);
        if (file.exists()) {
            return getFileCharset(file);
        }

        // 1. main path configured in the environment
        String mainPath = Env.getMainPath();
        if (mainPath != null) {
            file = new File(mainPath + File.separatorChar + sfile);
            if (file.exists()) {
                return getFileCharset(file);
            }
        }

        // 2. "main" and 3. "demo" directories under start.home.
        // Skipped when the property is unset so the literal "null" never ends up in a path.
        String home = System.getProperty("start.home");
        if (home != null) {
            String[] subDirs = { "main", "demo" };
            for (String subDir : subDirs) {
                file = new File(home + File.separatorChar + subDir + File.separatorChar + sfile);
                if (file.exists()) {
                    return getFileCharset(file);
                }
            }
        }

        Logger.info("File: "+ sfile +" not existed.");
        return null;
    }

    /**
     * Feeds a stream to {@link UniversalDetector} and resolves the result to
     * an encoding name, applying the caller's candidate filter if any.
     *
     * @param stream content to analyse; the caller owns and closes it
     * @param obj    the origin of the stream (a {@code File} or {@code URL}),
     *               handed to the fallback detector
     * @return the encoding name, or {@code null} when none was detected or it
     *         is not among the accepted names
     * @throws IOException if reading the stream fails
     */
    private String detectEncoding(InputStream stream, Object obj) throws IOException {
        UniversalDetector detector = new UniversalDetector(null);
        byte[] buffer = new byte[4096];
        int read;
        // Stop early once the detector is confident.
        while ((read = stream.read(buffer)) > 0 && !detector.isDone()) {
            detector.handleData(buffer, 0, read);
        }
        detector.dataEnd();
        String encoding = detector.getDetectedCharset();
        detector.reset();

        if (encoding == null) {
            encoding = CharEncodingDetectEx.getJavaEncode(obj);
        } else {
            // NOTE(review): this instance is never used directly; presumably its
            // constructor populates the static EncodingEx.javaname table read on
            // the next line - confirm before removing.
            new BytesEncodingDetect();
            if (!Arrays.asList(EncodingEx.javaname).contains(encoding)) {
                // Name unknown to the project's table: ask the fallback detector instead.
                encoding = CharEncodingDetectEx.getJavaEncode(obj);
            }
        }

        if (!m_codes.isEmpty()) {
            // m_codes holds upper-cased names, so the detected name must be
            // normalised the same way before the lookup.
            if (encoding != null && m_codes.contains(upper(encoding))) {
                return encoding;
            }
            return null;
        }
        return encoding;
    }

    /**
     * Downloads the content behind a URL and detects its encoding.
     *
     * @param url location to read
     * @return the encoding name, or {@code null} when detection fails or the
     *         URL cannot be read (the error is logged)
     */
    private String detectEncoding(URL url) {
        // try-with-resources closes the stream even when detection throws.
        try (InputStream istream = url.openStream()) {
            return detectEncoding(istream, url);
        } catch (Exception e) {
            Logger.error("Error loading or using URL " + e.toString());
        }
        return null;
    }

    /**
     * Tells whether a string matches a regular expression in its entirety.
     *
     * @param strUrl string to test
     * @param regExp regular expression
     * @return {@code true} when the whole string matches
     * @throws RQException if either argument is null or empty
     */
    private boolean isMatch(String strUrl, String regExp) {
        if (strUrl == null || strUrl.isEmpty()) {
            throw new RQException("spark isMatch strUrl is empty");
        }
        if (regExp == null || regExp.isEmpty()) {
            throw new RQException("spark isMatch regExp is empty");
        }
        Matcher m = Pattern.compile(regExp).matcher(strUrl);
        return m.matches();
    }

    /**
     * Opens a file and detects its encoding.
     *
     * @param file existing file to read
     * @return the encoding name, or {@code null} when none was accepted
     * @throws IOException if the file cannot be opened or read
     */
    private String getFileCharset(File file) throws IOException {
        // try-with-resources closes the stream even when detection throws.
        try (FileInputStream ins = new FileInputStream(file)) {
            return detectEncoding(ins, file);
        }
    }

    /** Upper-cases an encoding name independently of the default locale. */
    private static String upper(String name) {
        return name.toUpperCase(java.util.Locale.ROOT);
    }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy