All downloads are free. Search and download functionality uses the official Maven repository.

cc.mallet.cluster.tui.Text2Clusterings Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!
package cc.mallet.cluster.tui;

import gnu.trove.TIntArrayList;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.logging.Logger;

import cc.mallet.cluster.Clustering;
import cc.mallet.cluster.Clusterings;
import cc.mallet.cluster.Record;
import cc.mallet.pipe.Noop;
import cc.mallet.pipe.iterator.FileIterator;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;

//In progress
/**
 * Command-line tool that converts directories of text files into a serialized
 * {@link Clusterings} object.
 *
 * <p>Each directory given with {@code --input} produces one {@link Clustering}.
 * Every immediate subdirectory of that directory is treated as one cluster, and
 * every file that {@link FileIterator#getFileArray()} reports for the
 * subdirectory becomes one {@link Instance} whose data is a {@link Record}
 * built from the parsed file. The resulting {@code Clusterings} is written with
 * Java serialization to the file named by {@code --output}.
 */
public class Text2Clusterings {

	private static final Logger logger =
			MalletLogger.getLogger(Text2Clusterings.class.getName());

	/**
	 * Entry point: parses the command line, reads every input directory and
	 * serializes the resulting clusterings.
	 *
	 * <p>Exits with status -1 when no input directory was supplied. A failure
	 * while writing the output file is logged but does not change the exit
	 * status.
	 *
	 * @param args command-line arguments, processed by {@link CommandOption}
	 * @throws IOException if an input directory cannot be listed or an input
	 *         file cannot be read
	 */
	public static void main (String[] args) throws IOException {
		CommandOption.setSummary(Text2Clusterings.class,
				"A tool to convert a list of text files to a Clusterings.");
		CommandOption.process(Text2Clusterings.class, args);

		// The option is declared with a null fifth argument (apparently its
		// default value), so the value may be null when --input is omitted.
		if (classDirs.value == null || classDirs.value.length == 0) {
			logger.warning("You must include '--input DIR1 DIR2 ...' in order to specify a "
					+ "list of directories containing the documents for each class.");
			System.exit(-1);
		}

		Clustering[] clusterings = new Clustering[classDirs.value.length];
		int fi = 0; // number of files read so far, across all clusterings
		for (int i = 0; i < classDirs.value.length; i++) {
			// Alphabets are per clustering: they are not shared between input directories.
			Alphabet fieldAlph = new Alphabet();
			Alphabet valueAlph = new Alphabet();
			File directory = new File(classDirs.value[i]);
			File[] subdirs = getSubDirs(directory);
			Alphabet clusterAlph = new Alphabet();
			InstanceList instances = new InstanceList(new Noop());
			TIntArrayList labels = new TIntArrayList();
			for (int j = 0; j < subdirs.length; j++) {
				ArrayList<File> records = new FileIterator(subdirs[j]).getFileArray();
				// The cluster label is the index assigned to the subdirectory's path.
				int label = clusterAlph.lookupIndex(subdirs[j].toString());
				for (File record : records) {
					printProgress(fi);
					fi++;

					labels.add(label);
					instances.add(new Instance(new Record(fieldAlph, valueAlph, parseFile(record)),
							Integer.valueOf(label), record.toString(),
							record.toString()));
				}
			}
			clusterings[i] =
					new Clustering(instances, subdirs.length, labels.toNativeArray());
		}

		logger.info("\nread " + fi + " objects in " + clusterings.length + " clusterings.");
		try {
			ObjectOutputStream oos =
					new ObjectOutputStream(new FileOutputStream(outputFile.value));
			try {
				oos.writeObject(new Clusterings(clusterings));
			} finally {
				// Close even when serialization fails so the file handle is released.
				oos.close();
			}
		} catch (Exception e) {
			// Best effort: report the failure (with stack trace) through the logger.
			logger.log(java.util.logging.Level.WARNING,
					"Exception writing clustering to file " + outputFile.value + " " + e, e);
		}

	}

	/**
	 * Prints a progress indicator to standard output: the running count every
	 * 100 files, a dot every 10 files otherwise, and a line break every 1000
	 * files.
	 *
	 * @param count number of files processed before the current one
	 */
	private static void printProgress (int count) {
		if (count % 100 == 0) System.out.print(count);
		else if (count % 10 == 0) System.out.print(".");
		if (count % 1000 == 0 && count > 0) System.out.println();
		System.out.flush();
	}

	/**
	 * Returns the immediate subdirectories of {@code dir}, skipping any whose
	 * name consists only of dots (such as "." or "..").
	 *
	 * <p>The order of the result is the order returned by
	 * {@link File#listFiles()}.
	 *
	 * @param dir directory to scan
	 * @return the matching subdirectories; empty if there are none
	 * @throws IOException if {@code dir} cannot be listed, e.g. because it is
	 *         not a directory
	 */
	public static File[] getSubDirs (File dir) throws IOException {
		File[] fs = dir.listFiles();
		// listFiles() returns null rather than throwing when dir is not a
		// directory or cannot be read.
		if (fs == null)
			throw new IOException("Cannot list contents of " + dir
					+ " (not a directory, or an I/O error occurred)");
		ArrayList<File> ret = new ArrayList<File>();
		for (File f : fs)
			if (f.isDirectory() && !f.getName().matches("^\\.+$"))
				ret.add(f);
		return ret.toArray(new File[ret.size()]);
	}

	/**
	 * Reads a text file into an array of whitespace-separated tokens per line.
	 *
	 * <p>Each line is trimmed and split on runs of whitespace. Lines that yield
	 * fewer than two tokens (including blank lines) are dropped. The file is
	 * decoded by {@link FileReader} using the JVM's default charset.
	 *
	 * @param f file to read
	 * @return one token array per retained line, in file order
	 * @throws IOException if the file cannot be opened or read
	 */
	public static String[][] parseFile (File f) throws IOException {
		ArrayList<String[]> lines = new ArrayList<String[]>();
		BufferedReader r = new BufferedReader(new FileReader(f));
		try {
			String line;
			while ((line = r.readLine()) != null) {
				String[] words = line.trim().split("\\s+");
				if (words.length > 1)
					lines.add(words);
			}
		} finally {
			// Always release the file handle, even if reading fails part-way.
			r.close();
		}
		return lines.toArray(new String[lines.size()][]);
	}

	/** The {@code --input} option: the directories to convert, one per clustering. */
	static CommandOption.SpacedStrings classDirs =
			new CommandOption.SpacedStrings(
					Text2Clusterings.class,
					"input",
					"DIR...",
					true,
					null,
					"The directories containing text files to be clustered, one directory per clustering",
					null);

	/** The {@code --output} option: the file the serialized Clusterings is written to. */
	static CommandOption.String outputFile =
			new CommandOption.String(Text2Clusterings.class, "output", "FILENAME",
					true, "text.clusterings",
					"The filename to write the Clustering.", null);

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy