All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.fnlp.util.AV Maven / Gradle / Ivy

/**
*  This file is part of FNLP (formerly FudanNLP).
*  
*  FNLP is free software: you can redistribute it and/or modify
*  it under the terms of the GNU Lesser General Public License as published by
*  the Free Software Foundation, either version 3 of the License, or
*  (at your option) any later version.
*  
*  FNLP is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU Lesser General Public License for more details.
*  
*  You should have received a copy of the GNU General Public License
*  along with FudanNLP.  If not, see <http://www.gnu.org/licenses/>.
*  
*  Copyright 2009-2014 www.fnlp.org. All rights reserved. 
*/

package org.fnlp.util;
/*
 * 文件名:AV.java
 * 版权:Copyright 2008-2012 复旦大学 All Rights Reserved.
 * 描述:程序总入口
 * 修改人:xpqiu
 * 修改时间:2009-1-5
 * 修改内容:新增
 *
 * 修改人:〈修改人〉
 * 修改时间:YYYY-MM-DD
 * 跟踪单号:〈跟踪单号〉
 * 修改单号:〈修改单号〉
 * 修改内容:〈修改内容〉
 */

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * @version 1.0
 * @since 1.0
 */
public class AV {

	// Map from a String key to a container whose size() is read by calcAV().
	// NOTE(review): presumably the distinct contexts seen on the left of the key — confirm against calc().
	HashMap left;
	// Looked up in calcAV() with the keys of `left`; values also expose size().
	// NOTE(review): presumably the right-hand counterpart of `left` — confirm against calc().
	HashMap right;

	// Result table filled by calcAV(): key -> (int) min(log(left size), log(right size)).
	HashMap av;
	// NOTE(review): not used in the visible part of the file; meaning to be confirmed.
	boolean isSpace = false;
	// NOTE(review): not used in the visible part of the file; meaning to be confirmed.
	private int count=0;
	// NOTE(review): not used in the visible part of the file; looks like a maximum
	// length of the keys considered — confirm against calc().
	int maxLen = 4;

	/**
	 * Creates an instance whose {@code left}, {@code right} and {@code av}
	 * tables are all empty.
	 */
	public AV() {
		this.av = new HashMap();
		this.right = new HashMap();
		this.left = new HashMap();
	}

	/**
	 * Reads a fixed list of corpus files, computes the {@code av} table and
	 * writes it to a fixed output file, then prints {@code Done}.
	 *
	 * @param args command-line arguments; not used
	 */
	public static void main(String[] args) {
		// Inputs are read in this order before the table is computed.
		String[] corpora = {
				"D:/xpqiu/项目/自选/CLP2010/CWS/data/Training-Labelled.txt",
				"D:/xpqiu/项目/自选/CLP2010/CWS/data/Training-Unlabelled-B.txt",
				"D:/xpqiu/项目/自选/CLP2010/CWS/data/Test-B-Simplified.txt"
		};

		AV analyser = new AV();
		for (String corpus : corpora) {
			analyser.read(corpus);
		}

		analyser.calcAV();
		analyser.save("D:/xpqiu/项目/自选/CLP2010/CWS/av-b-lut.txt", true);
		System.out.println("Done");
	}

	/**
	 * Fills {@code av} with one integer score per key of {@code left}.
	 * <p>
	 * For each key the score is {@code (int) min(log(l), log(r))}, where
	 * {@code l} and {@code r} are the sizes of the collections stored under
	 * that key in {@code left} and {@code right}. A key that has no entry
	 * (or an empty entry) on either side is scored 0 rather than failing.
	 * The sizes of {@code left} and {@code av} are printed to standard output.
	 */
	private void calcAV() {
		System.out.println("count: "+left.size());
		for (Object k : left.keySet()) {
			String key = (String) k;
			int smaller = Math.min(sizeOrZero(left.get(key)), sizeOrZero(right.get(key)));
			// Math.log is monotonic, so log(min(l, r)) equals min(log(l), log(r)).
			// A size of 0 would give log(0) = -Infinity, which casts to
			// Integer.MIN_VALUE, so such keys get a score of 0 instead.
			int score = smaller > 0 ? (int) Math.log(smaller) : 0;
			av.put(key, score);
		}
		System.out.println("av count: "+av.size());
	}

	/**
	 * Returns the size of {@code values}, or 0 when it is {@code null}.
	 */
	private static int sizeOrZero(Object values) {
		// NOTE(review): assumes the map values are java.util.Collection
		// instances (e.g. HashSet) — confirm against calc().
		return values == null ? 0 : ((java.util.Collection<?>) values).size();
	}

	/**
	 * Passes the text found at {@code fileName} to {@code calc}, line by line.
	 * <p>
	 * If {@code fileName} is a directory, each of its entries is processed
	 * recursively. Otherwise the file is decoded as UTF-8 and every line is
	 * handed to {@code calc}. An exception raised while reading a file is
	 * printed to standard error and swallowed, so the caller can go on with
	 * its remaining inputs.
	 *
	 * @param fileName path of a text file, or of a directory to traverse
	 */
	public void read(String fileName) {
		File f = new File(fileName);
		if (f.isDirectory()) {
			File[] files = f.listFiles();
			if (files == null) {
				// listFiles() returns null when the directory cannot be listed
				// (I/O error or missing permission).
				System.err.println("Cannot list directory: " + fileName);
				return;
			}
			for (int i = 0; i < files.length; i++) {
				read(files[i].toString());
			}
		} else {
			try {
				BufferedReader bin = new BufferedReader(new InputStreamReader(
						new FileInputStream(fileName), "utf-8"));
				try {
					String sent;
					while ((sent = bin.readLine()) != null) {
						calc(sent);
					}
				} finally {
					// Closing the outermost reader also closes the underlying stream.
					bin.close();
				}
			} catch (Exception e) {
				// Best effort: report the failure and keep going.
				e.printStackTrace();
			}
		}
	}

	/**
	 * @param filename
	 * @param bcount 是否输出词频
	 */
	public void save(String filename, boolean bcount) {


		try {
			FileOutputStream fos = new FileOutputStream(filename);
			BufferedWriter bout = new BufferedWriter(new OutputStreamWriter(
					fos, "UTF-8"));
			Map.Entry[] entries= getSortedHashtableByValue(av); 
			for(int i=0;i




© 2015 - 2025 Weber Informatics LLC | Privacy Policy