All downloads are free. Search and download functionality uses the official Maven repository.

cc.mallet.pipe.NGramPreprocessor Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!
package cc.mallet.pipe;

import java.util.*;
import java.io.*;

import cc.mallet.util.*;
import cc.mallet.types.*;

/** This pipe changes text to lowercase, removes common XML entities (quot, apos, lt, gt), and replaces all punctuation
	except the - character with whitespace. It then breaks up tokens on whitespace and applies n-gram token replacements
	and deletions. Replacements are applied in the order they are specified, first by file and then within files.
*/

public class NGramPreprocessor extends Pipe implements Serializable {

	public ArrayList replacementSets;

	public NGramPreprocessor () {
		replacementSets = new ArrayList();
	}

	public int loadReplacements(String filename) throws IOException {
		BufferedReader in = new BufferedReader(new FileReader(filename));
		
		ReplacementSet set = new ReplacementSet();

		int totalReplacements = 0;
		
		String line;
		while ((line = in.readLine()) != null) {
			set.addReplacement(new Replacement(line));
			totalReplacements++;
		}
		in.close();
		
		replacementSets.add(set);
		
		return totalReplacements;
	}
	
	public int loadDeletions(String filename) throws IOException {
		BufferedReader in = new BufferedReader(new FileReader(filename));
			
		ReplacementSet set = new ReplacementSet();

		int totalReplacements = 0;
		
		String line;
		while ((line = in.readLine()) != null) {
			set.addReplacement(new Replacement(line, ""));
			totalReplacements++;
		}
		in.close();
		
		replacementSets.add(set);

		return totalReplacements;
	}

	public Instance pipe (Instance instance) {

		String input = (String) instance.getData();

		input = input.toLowerCase();

		//System.out.println(input);

		// Remove XML entities
		input = input.replaceAll("'", "'");
		input = input.replaceAll(""", "\"");
		input = input.replaceAll(">", "<");
		input = input.replaceAll("<", ">");

		// Remove all non letter/number characters except dash and apostrophe
		input = input.replaceAll("[^\\p{L}\\p{N}\\-\\']", " ");

		// Reduce all blocks of whitespace to a single space
		input = input.replaceAll("\\s+", " ");
		
		for (ReplacementSet set: replacementSets) {
			input = set.applyReplacements(input);
		}
	
		//System.out.println(input);
	
		instance.setData(input);

		return instance;
		
	}

	private static final long serialVersionUID = 1;
	private static final int CURRENT_SERIAL_VERSION = 1;
	
	private void writeObject(ObjectOutputStream out) throws IOException {
		out.writeInt (CURRENT_SERIAL_VERSION);
		out.writeObject(replacementSets);
	}
    
	private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
		int version = in.readInt ();
		replacementSets = (ArrayList) in.readObject();
	}

	public class ReplacementSet implements Serializable {
		HashMap> replacementIndex;

		public ReplacementSet() {
			replacementIndex = new HashMap>();
		}
		
		public void addReplacement(Replacement replacement) {
			String key = replacement.getFirstToken();
			
			if (! replacementIndex.containsKey(key)) {
				replacementIndex.put(key, new ArrayList());
			}
			
			replacementIndex.get(key).add(replacement);
		}

		public String applyReplacements(String input) {
			String[] tokens = input.split(" ");
			StringBuilder output = new StringBuilder();

			int position = 0;
			while (position < tokens.length) {
				String token = tokens[position];
				
				int initialPosition = position;
				
				if (replacementIndex.containsKey(token)) {
					for (Replacement replacement: replacementIndex.get(token)) {
						position = replacement.apply(tokens, position, output);
						if (position > initialPosition) {
							break;
						}
					}
				}
				
				if (position == initialPosition) {
					// No replacements were made, so nothing
					//  has been output.
					output.append(token + " ");
					position ++;
				}
			}

			return output.toString();
		}

		private static final long serialVersionUID = 1;
		private static final int CURRENT_SERIAL_VERSION = 1;
		
		private void writeObject(ObjectOutputStream out) throws IOException {
			out.writeInt (CURRENT_SERIAL_VERSION);
			out.writeObject(replacementIndex);
		}
        
		private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
			int version = in.readInt ();
			replacementIndex = (HashMap>) in.readObject();
		}

	}
	
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy