cc.mallet.pipe.CharSequenceRemoveHTML Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!

/* Copyright (C) 2006 Univ. of Massachusetts Amherst, Computer Science Dept.
 This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
 http://www.cs.umass.edu/~mccallum/mallet 
 This software is provided under the terms of the Common Public License,
 version 1.0, as published by http://www.opensource.org.  For further
 information, see the file `LICENSE' included with this distribution. */

package cc.mallet.pipe;


import javax.swing.text.html.*;

import cc.mallet.pipe.iterator.FileIterator;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;

import java.io.*;

/**
 * This pipe removes HTML from a CharSequence. The HTML is actually parsed here,
 * so we should have less HTML slipping through... but it is almost certainly
 * much slower than a regular expression, and could fail on broken HTML.
 * 
 * @author Greg Druck [email protected]
 */

public class CharSequenceRemoveHTML extends Pipe {

	/**
	 * Replaces the carrier's data with the plain text found in it, with all
	 * HTML markup removed.
	 *
	 * @param carrier instance whose data is a {@link CharSequence} of HTML
	 * @return the same instance, with its data set to the extracted text
	 *         (each text run reported by the parser is followed by a newline)
	 */
	public Instance pipe(Instance carrier) {
		String html = ((CharSequence) carrier.getData()).toString();
		carrier.setData(stripHtml(html));
		return carrier;
	}

	/**
	 * Parses the given HTML and returns the concatenation of its text runs.
	 *
	 * @param html the markup to strip
	 * @return the text content, one parser-reported text run per line
	 */
	private static String stripHtml(String html) {
		// NOFRAMES tags are taken out ahead of time because the Java HTML
		// parser seems to die on them. Literal replacement is used (not a
		// regex): the previous pattern for the opening tag was a lone
		// backslash, which is an invalid regex and threw
		// PatternSyntaxException on every call.
		String cleaned = html.replace("<NOFRAMES>", "").replace("</NOFRAMES>", "");

		HTMLEditorKit.Parser parser = new ParserGetter().getParser();
		TagStripper stripper = new TagStripper();

		try {
			// third argument: ignore any charset directive in the document,
			// since the input is already decoded characters
			parser.parse(new StringReader(cleaned), stripper, true);
		} catch (IOException e) {
			// best effort: report the problem and keep whatever text was
			// collected before the failure
			System.err.println(e);
		}
		return stripper.getText();
	}

	/**
	 * Parser callback that ignores every tag and accumulates only the text
	 * handed to {@link #handleText(char[], int)}.
	 */
	private static class TagStripper extends HTMLEditorKit.ParserCallback {
		// StringBuilder avoids the quadratic cost of repeated String concatenation
		private final StringBuilder text = new StringBuilder();

		/** Appends one text run followed by a newline separator. */
		@Override
		public void handleText(char[] txt, int position) {
			text.append(txt).append('\n');
		}

		/** @return all text collected so far */
		public String getText() {
			return text.toString();
		}

	}

	/** Subclass that exists purely to expose the protected parser accessor. */
	private static class ParserGetter extends HTMLEditorKit {
		// purely to make this method public
		@Override
		public HTMLEditorKit.Parser getParser() {
			return super.getParser();
		}
	}

	/**
	 * Strips HTML from every file under a directory and prints the resulting
	 * text of each instance to standard error.
	 *
	 * @param args {@code args[0]} is the directory containing the HTML files
	 */
	public static void main(String[] args) {
		if (args.length < 1) {
			System.err.println("Usage: CharSequenceRemoveHTML <html-directory>");
			System.exit(1);
		}
		String htmldir = args[0];
		Pipe pipe = new SerialPipes(new Pipe[] { new Input2CharSequence(),
				new CharSequenceRemoveHTML() });
		InstanceList list = new InstanceList(pipe);
		list.addThruPipe(new FileIterator(htmldir, FileIterator.STARTING_DIRECTORIES));

		for (int index = 0; index < list.size(); index++) {
			Instance inst = list.get(index);
			System.err.println(inst.getData());
		}

	}

}