All downloads are free. Search and download functionality uses the official Maven repository.

com.opslab.util.algorithmImpl.StringImpl Maven / Gradle / Ivy

package com.opslab.util.algorithmImpl;

import com.opslab.util.CharsetUtil;
import com.opslab.util.SysUtil;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;

/**
 * 对比俩个字符串的相似度
 */
public class StringImpl {
    //第一种实现方式
    private static String longestCommonSubstring(String strA, String strB) {
        char[] chars_strA = strA.toCharArray();
        char[] chars_strB = strB.toCharArray();
        int m = chars_strA.length;
        int n = chars_strB.length;
        int[][] matrix = new int[m + 1][n + 1];
        for (int i = 1; i <= m; i++) {
            for (int j = 1; j <= n; j++) {
                if (chars_strA[i - 1] == chars_strB[j - 1])
                    matrix[i][j] = matrix[i - 1][j - 1] + 1;
                else
                    matrix[i][j] = Math.max(matrix[i][j - 1], matrix[i - 1][j]);
            }
        }
        char[] result = new char[matrix[m][n]];
        int currentIndex = result.length - 1;
        while (matrix[m][n] != 0) {
            if (matrix[n] == matrix[n - 1])
                n--;
            else if (matrix[m][n] == matrix[m - 1][n])
                m--;
            else {
                result[currentIndex] = chars_strA[m - 1];
                currentIndex--;
                n--;
                m--;
            }
        }
        return new String(result);
    }

    private static boolean charReg(char charValue) {
        return (charValue >= 0x4E00 && charValue <= 0X9FA5) || (charValue >= 'a' && charValue <= 'z') || (charValue >= 'A' && charValue <= 'Z') || (charValue >= '0' && charValue <= '9');
    }

    private static String removeSign(String str) {
        StringBuffer sb = new StringBuffer();
        for (char item : str.toCharArray()){
            if (charReg(item)) {
                sb.append(item);
            }
        }
        return sb.toString();
    }

    /**
     * 快速比较俩个字符串的相似度
     *
     * @param strA 较长的字符串
     * @param strB 较短的字符串
     * @return 俩个字符串的相似度
     * 

summary

:较长的字符串放到前面有助于提交效率 */ public static double SimilarDegree(String strA, String strB) { String newStrA = removeSign(strA); String newStrB = removeSign(strB); int temp = Math.max(newStrA.length(), newStrB.length()); int temp2 = longestCommonSubstring(newStrA, newStrB).length(); return temp2 * 1.0 / temp; } //第二种实现方式 private static int compare(String str, String target) { int d[][]; // 矩阵 int n = str.length(); int m = target.length(); int i; // 遍历str的 int j; // 遍历target的 char ch1; // str的 char ch2; // target的 int temp; // 记录相同字符,在某个矩阵位置值的增量,不是0就是1 if (n == 0) { return m; } if (m == 0) { return n; } d = new int[n + 1][m + 1]; for (i = 0; i <= n; i++) { // 初始化第一列 d[i][0] = i; } for (j = 0; j <= m; j++) { // 初始化第一行 d[0][j] = j; } for (i = 1; i <= n; i++) { // 遍历str ch1 = str.charAt(i - 1); // 去匹配target for (j = 1; j <= m; j++) { ch2 = target.charAt(j - 1); if (ch1 == ch2) { temp = 0; } else { temp = 1; } // 左边+1,上边+1, 左上角+temp取最小 d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + temp); } } return d[n][m]; } private static int min(int one, int two, int three) { return (one = one < two ? one : two) < three ? one : three; } /** * 获取字符串的相似度 * * @param str * @param target * @return */ public static double SimilarityRatio(String str, String target) { return 1 - (double) compare(str, target) / Math.max(str.length(), target.length()); } /** * 获取字符串的编码 *

* Summary:该方法利用cpDetector概率探测故性能和准确率不是100% * * @param str 需要处理的字符串 */ public static String encoding(String str) { java.nio.charset.Charset charset = null; try { InputStream inputStream = new ByteArrayInputStream(str.getBytes()); charset = cpDetector.codepageDetector.detectCodepage(inputStream, 3); if (charset != null) { return charset.name(); } } catch (Exception e) { e.printStackTrace(); } return null; } /** * 获取字符串编码 * * @param str 需要处理的字符串 */ public static String simpleEncoding(String str) { try{ byte[] bs = str.getBytes(SysUtil.JVM_ENCODING); if(str.equals(new String(bs,CharsetUtil.UTF_8))){ return CharsetUtil.UTF_8; } if(str.equals(new String(bs,CharsetUtil.GBK))){ return CharsetUtil.GBK; } if(str.equals(new String(bs,"ISO-8859-1"))){ return "ISO-8859-1"; } }catch(UnsupportedEncodingException e) { System.out.println("111111111"); e.printStackTrace(); } String encode = "GB2312"; try { if (str.equals(new String(str.getBytes(encode), encode))) { return encode; } } catch (UnsupportedEncodingException e) { e.printStackTrace(); } encode = "ISO-8859-1"; try { if (str.equals(new String(str.getBytes(encode), encode))) { return encode; } } catch (UnsupportedEncodingException exception1) { exception1.printStackTrace(); } encode = "UTF-8"; try { if (str.equals(new String(str.getBytes(encode), encode))) { return encode; } } catch (UnsupportedEncodingException exception1) { exception1.printStackTrace(); } encode = "GBK"; try { if (str.equals(new String(str.getBytes(encode), encode))) { return encode; } } catch (UnsupportedEncodingException exception1) { exception1.printStackTrace(); } return ""; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy