All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hankcs.hanlp.dictionary.common.CommonDictionary Maven / Gradle / Ivy

There is a newer version: portable-1.8.5
Show newest version
/*
 * 
 * He Han
 * [email protected]
 * 2014/9/9 22:30
 *
 * 
 * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
 * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
 * 
 */
package com.hankcs.hanlp.dictionary.common;

import com.hankcs.hanlp.collection.trie.DoubleArrayTrie;
import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.utility.TextUtility;

import java.io.BufferedReader;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.*;

import static com.hankcs.hanlp.utility.Predefine.BIN_EXT;
import static com.hankcs.hanlp.utility.Predefine.logger;

/**
 * A general-purpose dictionary backed by a {@link DoubleArrayTrie}. It reads a fixed-format
 * text dictionary (one entry per line, whitespace-separated, first token is the key) while
 * leaving the value (label) type generic.
 *
 * <p>Subclasses define how a value is parsed from a text line ({@link #createValue(String[])}),
 * how it is written to the binary cache ({@link #saveValue(Object, DataOutputStream)}) and how
 * the value array is read back from that cache ({@link #loadValueArray(ByteArray)}).
 *
 * @param <V> type of the value stored for each key
 * @author hankcs
 */
public abstract class CommonDictionary<V>
{
    /**
     * Trie holding the key/value pairs. It has no initializer here and is (re)created by
     * {@link #load(String)}; the query methods dereference it directly.
     */
    DoubleArrayTrie<V> trie;

    /**
     * Reads the value array from the binary cache.
     *
     * @param byteArray the binary cache content
     * @return the values, or {@code null} to signal that the cache cannot be used
     */
    protected abstract V[] loadValueArray(ByteArray byteArray);

    /**
     * Loads the dictionary located at a text path.
     *
     * <p>The binary cache {@code path + BIN_EXT} is tried first. If it cannot be used, the text
     * file is parsed as UTF-8, the trie is built from it and the cache is (re)written.
     *
     * @param path path of the text dictionary
     * @return {@code true} if the dictionary was loaded from the cache or built from the text
     * file; {@code false} if the text file could not be read or the trie could not be built
     */
    public boolean load(String path)
    {
        trie = new DoubleArrayTrie<V>();
        long start = System.currentTimeMillis();
        if (loadDat(ByteArray.createByteArray(path + BIN_EXT)))
        {
            return true;
        }
        // TreeMap keeps the keys sorted, so they reach trie.build() in ascending order.
        TreeMap<String, V> map = new TreeMap<String, V>();
        try
        {
            BufferedReader br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8"));
            try
            {
                String line;
                while ((line = br.readLine()) != null)
                {
                    String[] paramArray = line.split("\\s");
                    // Blank lines, whitespace-only lines and lines starting with whitespace
                    // carry no key token; skip them instead of failing or storing an empty key.
                    if (paramArray.length == 0 || paramArray[0].length() == 0)
                    {
                        continue;
                    }
                    map.put(paramArray[0], createValue(paramArray));
                }
            }
            finally
            {
                // Release the file handle even when parsing fails half-way.
                br.close();
            }
        }
        catch (Exception e)
        {
            logger.warning("读取" + path + "失败" + e);
            // Do not build (and cache to disk) a trie from a partially read dictionary.
            return false;
        }
        onLoaded(map);
        Set<Map.Entry<String, V>> entrySet = map.entrySet();
        List<String> keyList = new ArrayList<String>(entrySet.size());
        List<V> valueList = new ArrayList<V>(entrySet.size());
        for (Map.Entry<String, V> entry : entrySet)
        {
            keyList.add(entry.getKey());
            valueList.add(entry.getValue());
        }
        int resultCode = trie.build(keyList, valueList);
        if (resultCode != 0)
        {
            logger.warning("trie建立失败");
            return false;
        }
        logger.info(path + "加载成功,耗时" + (System.currentTimeMillis() - start) + "ms");
        // Writing the cache is best effort: a failure is logged inside saveDat and does not
        // invalidate the dictionary that is already in memory.
        saveDat(path + BIN_EXT, valueList);
        return true;
    }

    /**
     * Loads the dictionary from its binary cache.
     *
     * @param byteArray the binary cache content; may be {@code null}
     * @return {@code true} if both the value array and the trie were loaded
     */
    protected boolean loadDat(ByteArray byteArray)
    {
        // NOTE(review): ByteArray.createByteArray presumably yields null when the cache file
        // is missing or unreadable - confirm. Guard here so trie.load never receives null.
        if (byteArray == null)
        {
            return false;
        }
        V[] valueArray = loadValueArray(byteArray);
        if (valueArray == null)
        {
            return false;
        }
        return trie.load(byteArray, valueArray);
    }

    /**
     * Writes the binary cache: the number of values, each value (via
     * {@link #saveValue(Object, DataOutputStream)}), then the trie itself.
     *
     * @param path       path of the binary cache file
     * @param valueArray the values, in the order they were handed to the trie
     * @return {@code true} on success, {@code false} if anything went wrong (the error is logged)
     */
    protected boolean saveDat(String path, List<V> valueArray)
    {
        try
        {
            DataOutputStream out = new DataOutputStream(IOUtil.newOutputStream(path));
            try
            {
                out.writeInt(valueArray.size());
                for (V item : valueArray)
                {
                    saveValue(item, out);
                }
                trie.save(out);
            }
            finally
            {
                // Close the stream on the failure path as well, so the handle is not leaked.
                out.close();
            }
        }
        catch (Exception e)
        {
            logger.warning("保存失败" + TextUtility.exceptionToString(e));
            return false;
        }
        return true;
    }

    /**
     * Writes a single value to the stream.
     *
     * @param value the value to write
     * @param out   the stream to write to
     * @throws IOException if writing fails
     */
    protected abstract void saveValue(V value, DataOutputStream out) throws IOException;

    /**
     * Looks up a word.
     *
     * @param key the word
     * @return whatever the trie returns for the key; {@link #contains(String)} treats
     * {@code null} as "not present"
     */
    public V get(String key)
    {
        return trie.get(key);
    }

    /**
     * Tells whether the dictionary contains a key.
     *
     * @param key the word
     * @return {@code true} if {@link #get(String)} returns a non-null value for the key
     */
    public boolean contains(String key)
    {
        return get(key) != null;
    }

    /**
     * Size of the dictionary.
     *
     * @return the size reported by the trie
     */
    public int size()
    {
        return trie.size();
    }

    /**
     * Creates a value from one line of the text dictionary.
     *
     * @param params the whitespace-separated tokens of the line; the first element is the key,
     *               so remember to skip it
     * @return the value to store for that key
     */
    protected abstract V createValue(String[] params);

    /**
     * Callback invoked after the text dictionary has been read and before the trie is built.
     * The default implementation does nothing.
     *
     * @param map the entries read from the text file, sorted by key
     */
    protected void onLoaded(TreeMap<String, V> map)
    {
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy