
org.fnlp.util.AV Maven / Gradle / Ivy
/**
* This file is part of FNLP (formerly FudanNLP).
*
* FNLP is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* FNLP is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with FudanNLP. If not, see <http://www.gnu.org/licenses/>.
*
* Copyright 2009-2014 www.fnlp.org. All rights reserved.
*/
package org.fnlp.util;
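/*
* File name: WordCount.java
* Copyright: Copyright 2008-2012 Fudan University All Rights Reserved.
* Description: main program entry point
* Modified by: xpqiu
* Modified on: 2009-1-5
* Change: newly added
*
* Modified by: <modifier>
* Modified on: YYYY-MM-DD
* Tracking number: <tracking number>
* Change number: <change number>
* Change: <change description>
*/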
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
/**
* @version 1.0
* @since 1.0
*/
public class AV {
// Table keyed by String; calcAV reads size() of each value stored here.
// NOTE(review): presumably the set of distinct left-hand neighbours seen for
// each key -- it is populated outside the visible part of this file; confirm.
HashMap left;
// Counterpart of `left`; calcAV looks it up with the same keys and also reads
// size() of each value.
// NOTE(review): presumably the right-hand neighbours; confirm.
HashMap right;
// Filled by calcAV: key -> (int) min(ln(size of left value), ln(size of right value)).
HashMap av;
// Not referenced in the visible part of this file.
boolean isSpace = false;
// Not referenced in the visible part of this file.
private int count=0;
// Not referenced in the visible part of this file.
// NOTE(review): looks like an upper bound on candidate length -- confirm against calc().
int maxLen = 4;
/**
 * Creates an instance whose three tables ({@code left}, {@code right} and
 * {@code av}) start out empty.
 */
public AV() {
    this.av = new HashMap();
    this.right = new HashMap();
    this.left = new HashMap();
}
/**
 * Command-line entry point. Reads three fixed corpus files in order, runs
 * {@code calcAV()} over everything that was read, writes the result to a
 * fixed output file and prints {@code Done}.
 *
 * @param args ignored; all paths are hard-coded
 */
public static void main(String[] args) {
    // Alternative input location kept from the original author:
    // "D:/xpqiu/项目/自选/CLP2010/CWS/Training-Unlabelled-B.txt"
    final String[] inputs = {
        "D:/xpqiu/项目/自选/CLP2010/CWS/data/Training-Labelled.txt",
        "D:/xpqiu/项目/自选/CLP2010/CWS/data/Training-Unlabelled-B.txt",
        "D:/xpqiu/项目/自选/CLP2010/CWS/data/Test-B-Simplified.txt"
    };
    AV fm = new AV();
    for (String input : inputs) {
        fm.read(input);
    }
    fm.calcAV();
    fm.save("D:/xpqiu/项目/自选/CLP2010/CWS/av-b-lut.txt", true);
    System.out.println("Done");
}
/**
 * Fills {@code av} with one score per key of {@code left}.
 *
 * <p>The score of a key is {@code (int) min(ln(|left value|), ln(|right value|))},
 * where {@code |.|} is the {@code size()} of the set stored under that key.
 * The {@code (int)} cast truncates toward zero, so sizes 1 and 2 both give 0.
 * An empty set gives {@code ln(0) = -Infinity}, which the cast turns into
 * {@code Integer.MIN_VALUE}.
 *
 * <p>Keys for which either table holds no value are skipped instead of
 * raising a {@code NullPointerException}. Keys present only in
 * {@code right} are never visited. Existing entries of {@code av} are
 * overwritten but never removed.
 *
 * <p>Prints the number of keys in {@code left} before, and the size of
 * {@code av} after, to standard output.
 *
 * NOTE(review): "AV" presumably stands for accessor variety -- confirm.
 */
private void calcAV() {
    System.out.println("count: " + left.size());
    // Iterate as Object and cast explicitly so this compiles whether or not
    // the tables are declared with type arguments.
    for (Object rawKey : left.keySet()) {
        String key = (String) rawKey;
        Set<?> leftContexts = (Set<?>) left.get(key);
        Set<?> rightContexts = (Set<?>) right.get(key);
        if (leftContexts == null || rightContexts == null) {
            // No value on one side: there is nothing to take the minimum of.
            continue;
        }
        // Primitive doubles: no need to box the logarithms.
        double l = Math.log(leftContexts.size());
        double r = Math.log(rightContexts.size());
        av.put(key, (int) Math.min(l, r));
    }
    System.out.println("av count: " + av.size());
}
/**
 * Feeds every line of a UTF-8 text file to {@code calc}. If
 * {@code fileName} names a directory, each entry in it is processed
 * recursively instead.
 *
 * <p>Any exception raised while opening, reading or processing a file is
 * printed with its stack trace and otherwise ignored, so one bad file does
 * not stop the remaining ones from being read. The underlying file stream
 * is always closed, whether or not reading succeeded.
 *
 * @param fileName path of a file or directory to read
 */
public void read(String fileName) {
    File f = new File(fileName);
    if (f.isDirectory()) {
        File[] files = f.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be listed
            // (I/O error); report it instead of failing on files.length.
            System.err.println("Cannot list directory: " + fileName);
            return;
        }
        for (File child : files) {
            read(child.toString());
        }
        return;
    }
    try {
        FileInputStream in = new FileInputStream(fileName);
        try {
            BufferedReader bin = new BufferedReader(
                    new InputStreamReader(in, "utf-8"));
            String sent;
            while ((sent = bin.readLine()) != null) {
                calc(sent);
            }
        } finally {
            // Closing the underlying stream releases the file handle even
            // when reading or calc() throws.
            in.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
* @param filename
* @param bcount 是否输出词频
*/
public void save(String filename, boolean bcount) {
try {
FileOutputStream fos = new FileOutputStream(filename);
BufferedWriter bout = new BufferedWriter(new OutputStreamWriter(
fos, "UTF-8"));
Map.Entry[] entries= getSortedHashtableByValue(av);
for(int i=0;i
© 2015 - 2025 Weber Informatics LLC | Privacy Policy