All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.nlpcn.commons.lang.dat.DATMaker Maven / Gradle / Ivy

package org.nlpcn.commons.lang.dat;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;

import org.nlpcn.commons.lang.tire.domain.SmartForest;
import org.nlpcn.commons.lang.util.FileIterator;
import org.nlpcn.commons.lang.util.IOUtil;
import org.nlpcn.commons.lang.util.StringUtil;

/**
 * dat maker
 *
 * @author ansj
 */
public class DATMaker {

	private static final Logger LOG = Logger.getLogger(DATMaker.class.getName());

	// 动态数组扩容
	private final int capacity = 100000;
	private Item[] dat = new Item[capacity];

	/**
	 * 构建默认的DAT
	 */
	public void maker(final String dicPath) throws Exception {
		maker(dicPath, BasicItem.class);
	}

	/**
	 * 构建用户自定义的dat
	 * 
	 * @throws FileNotFoundException
	 * @throws IllegalAccessException
	 * @throws InstantiationException
	 */
	public void maker(final String dicPath, final Class cla) throws FileNotFoundException, InstantiationException, IllegalAccessException {
		long start = System.currentTimeMillis();
		LOG.info("make basic tire begin !");

		final SmartForest forest = new SmartForest();
		final FileIterator it = IOUtil.instanceFileIterator(dicPath, IOUtil.UTF8);
		if (it == null) {
			throw new FileNotFoundException();
		}
		try {
			String temp;
			while (it.hasNext()) {
				temp = it.next();
				if (StringUtil.isBlank(temp)) {
					continue;
				}
				final Item item = cla.newInstance();
				final String[] split = temp.split("\t");
				item.init(split);
				forest.add(split[0], item);
			}
		} finally {
			it.close();
		}
		LOG.info("make basic tire over use time " + (System.currentTimeMillis() - start) + " ms");

		start = System.currentTimeMillis();
		LOG.info("make dat tire begin !");
		makeDAT(tree2List(cla, forest));
		LOG.info("make dat tire over use time " + (System.currentTimeMillis() - start) + " ms! dat len is " + datArrLen() + "! dat size is " + datItemSize());

	}

	private void makeDAT(final List all) {
		// all 就是tire树中没一个前缀集合
		for (int i = 0; i < all.size(); i++) {
			final Item item = all.get(i);
			final char[] chars = item.name.toCharArray();// 每个节点中的词.
			final int length = chars.length;
			// 如果词长度为一.直接放置到ascii码的位置上.并且保证此字的值大于65536
			if (length == 1) {
				item.check = -1;
				item.index = chars[0];
				dat[item.index] = item;
			} else {
				// 得道除了尾字啊外的位置,比如 "中国人" 此时返回的是"中国"的Item
				// 前缀是否相同,如果相同保存在临时map中.直到不同
				final Item pre = getPre(item);// 用来保留默认前缀
				final List group = findGroup(all, i, pre);
				item2DAT(pre, group);
				i = i + group.size() - 1;
			}
		}
	}

	/**
	 * 找到相同的组
	 */
	private List findGroup(final List all, final int i, final Item pre) {
		final List group = new ArrayList();
		group.add(all.get(i));
		for (int j = i + 1; j < all.size(); j++) {
			final Item tmp = all.get(j);
			if (pre == getPre(tmp)) {
				group.add(tmp);
			} else {
				break;
			}
		}
		return group;
	}

	/**
	 * 将迭代结果存入dat
	 */
	private void item2DAT(final Item pre, final List samePreGroup) {
		updateBaseValue(samePreGroup, pre);
		// 处理完冲突后将这些值填充到双数组中
		for (final Item tmp : samePreGroup) {
			tmp.index = pre.base + getLastChar(tmp.name);
			dat[pre.base + getLastChar(tmp.name)] = tmp;
			tmp.check = pre.index;
		}
	}

	/**
	 * 更新上一级的base值,直到冲突解决
	 */
	private void updateBaseValue(final List samePreGroup, final Item pre) {
		Iterator iterator = samePreGroup.iterator();
		while (iterator.hasNext()) {
			final Item item = iterator.next();

			checkLength(pre.base + getLastChar(item.name));

			if (dat[pre.base + getLastChar(item.name)] != null) {
				pre.base++;
				iterator = samePreGroup.iterator();
			}
		}
	}

	/**
	 * 检查数组长度并且扩容
	 */
	private void checkLength(final int len) {
		if (len >= dat.length) {
			dat = Arrays.copyOf(dat, len + capacity);
		}
	}

	/**
	 * 获得一个词语的最后一个字符
	 */
	public char getLastChar(final String word) {
		return word.charAt(word.length() - 1);
	}

	/**
	 * 找到该字符串上一个的位置字符串上一个的位置
	 *
	 * @param item
	 *            传入的字符串char数组
	 */
	public Item getPre(final Item item) {
		final char[] chars = item.name.toCharArray();
		int tempBase = 0;
		if (chars.length == 2) {
			return dat[chars[0]];
		}
		for (int i = 0; i < chars.length - 1; i++) {
			if (i == 0) {
				tempBase += chars[i];
			} else {
				tempBase = dat[tempBase].base + chars[i];
			}
		}
		return dat[tempBase];
	}

	/**
	 * 将tire树 广度遍历为List
	 * 
	 * @throws InstantiationException
	 */
	private List tree2List(final Class cla, final SmartForest forest) throws InstantiationException, IllegalAccessException {
		final List all = new ArrayList();
		treeToLibrary(cla, all, forest, "");
		return all;
	}

	/**
	 * 广度遍历
	 * 
	 * @throws IllegalAccessException
	 * @throws InstantiationException
	 */
	private void treeToLibrary(final Class cla, final List all, final SmartForest sf, final String preStr)
			throws InstantiationException, IllegalAccessException {
		final SmartForest[] branches = sf.getBranches();
		if (branches == null) {
			return;
		}

		for (final SmartForest branche : branches) {
			if (branche == null) {
				continue;
			}
			// 将branch的状态赋予
			Item param = branche.getParam();
			if (param == null) {
				param = cla.newInstance();
				param.name = preStr + (branche.getC());
				param.status = 1;
			} else {
				param.status = branche.getStatus();
			}
			all.add(param);
		}

		for (final SmartForest branche : branches) {
			if (branche != null) {
				treeToLibrary(cla, all, branche, preStr + (branche.getC()));
			}
		}
	}

	/**
	 * 取得数组的真实长度
	 */
	private int datArrLen() {
		for (int i = this.dat.length - 1; i >= 0; i--) {
			if (this.dat[i] != null) {
				return i + 1;
			}
		}
		return 0;
	}

	/**
	 * 取得数组的真实长度
	 */
	private int datItemSize() {
		int size = 0;
		for (int i = this.dat.length - 1; i >= 0; i--) {
			if (this.dat[i] != null) {
				size++;
			}
		}
		return size;
	}

	/**
	 * 获得dat数组
	 */
	public Item[] getDAT() {
		return this.dat;
	}

	/**
	 * 序列化dat对象
	 */
	public void save(final String path) throws IOException {
		ObjectOutput writer = null;
		try {
			writer = new ObjectOutputStream(new FileOutputStream(path));
			writer.writeInt(datArrLen());
			writer.writeInt(datItemSize());
			for (Item item : dat) {
				if (item == null) {
					continue;
				}
				writer.writeObject(item);
			}
		} finally {
			if (writer != null) {
				writer.flush();
				writer.close();
			}
		}
	}

	/**
	 * 保存到可阅读的文本.需要重写这个类
	 * 
	 * @throws IOException
	 */
	public void saveText(final String path) throws IOException {
		Writer writer = null;
		try {
			writer = new FileWriter(new File(path));
			writer.write(String.valueOf(datArrLen()));
			writer.write('\n');
			for (final Item item : dat) {
				if (item != null) {
					writer.write(item.toString());
					writer.write('\n');
				}
			}
			writer.flush();
		} finally {
			if (writer != null) {
				writer.flush();
				writer.close();
			}
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy