edu.ucla.sspace.tools.ChildesParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sspace-wordsi Show documentation
Show all versions of sspace-wordsi Show documentation
The S-Space Package is a collection of algorithms for building
Semantic Spaces as well as a highly-scalable library for designing new
distributional semantics algorithms. Distributional algorithms process text
corpora and represent the semantics of words as high-dimensional feature
vectors. This package also includes matrices, vectors, and numerous
clustering algorithms. These approaches are known by many names, such as
word spaces, semantic spaces, or distributed semantics and rest upon the
Distributional Hypothesis: words that appear in similar contexts have
similar meanings.
The newest version!
/*
* Copyright 2009 Keith Stevens
*
* This file is part of the S-Space package and is covered under the terms and
* conditions therein.
*
* The S-Space package is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation and distributed hereunder to you.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES,
* EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE
* NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY
* PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION
* WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER
* RIGHTS.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package edu.ucla.sspace.tools;
import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.text.IteratorFactory;
import edu.ucla.sspace.util.HashMultiMap;
import edu.ucla.sspace.util.MultiMap;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOError;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * A simple XML parser for the Childes corpus.  The words of each utterance
 * are extracted from the XML and written to a specified file.  A resulting
 * document may consist of all the utterances in an XML file or of a single
 * utterance, in which case one XML file generates multiple documents.
 *
 * <p>Part of speech tags seen while parsing are accumulated and written to a
 * second file when {@link #finish()} is called.
 *
 * @author Keith Stevens
 * @author David Jurgens
 */
public class ChildesParser {

    /**
     * The values of the {@code type} attribute that mark an auxiliary
     * ({@code a}) node as a comment whose text is used to generate augmented
     * utterances.
     */
    private static final String[] GROUNDING_COMMENT_TYPES = {
        "action", "actions", "addressee", "comments",
        "explanation", "gesture", "happening", "situation"
    };

    /**
     * A writer for writing utterances.
     */
    private PrintWriter writer;

    /**
     * A writer for writing part of speech tags for words in Childes.
     */
    private PrintWriter posWriter;

    /**
     * A mapping from each word to the part of speech tags it was seen with.
     */
    private MultiMap<String, String> posTags;

    /**
     * {@code true} if the parser should generate augmented utterances from the
     * comments when parsing.
     */
    private final boolean generateAugmented;

    /**
     * {@code true} if the parser should separate sentences with periods.
     */
    private final boolean separateByPeriod;

    /**
     * {@code true} if the parser should append pos tags to each token in the
     * corpus.  This is useful when the corpus needs the tags to be aligned with
     * the text.  The format of each token will be TOKEN-POS.
     */
    private final boolean appendPosTags;

    /**
     * {@code true} if the parser should generate one document for all the text
     * processed.
     */
    private final boolean generateOneDoc;

    /**
     * Creates the {@code ChildesParser} and opens both output files.
     *
     * @param outFile the name of the file the extracted words are written to
     * @param posOutFile the name of the file the part of speech tags are
     *        written to by {@link #finish()}
     * @param generateAugmented {@code true} if augmented utterances should be
     *        generated from the comments attached to each utterance
     * @param separateByPeriod {@code true} if a period should follow each
     *        utterance when a whole file is written as one document
     * @param appendPosTags {@code true} if each token should be written as
     *        TOKEN-POS
     * @param generateOneDoc {@code true} if everything should be written as a
     *        single document, i.e. without line breaks
     *
     * @throws IOError if either output file cannot be opened
     */
    public ChildesParser(String outFile,
                         String posOutFile,
                         boolean generateAugmented,
                         boolean separateByPeriod,
                         boolean appendPosTags,
                         boolean generateOneDoc) {
        this.generateAugmented = generateAugmented;
        this.separateByPeriod = separateByPeriod;
        this.appendPosTags = appendPosTags;
        this.generateOneDoc = generateOneDoc;

        posTags = new HashMultiMap<String, String>();
        try {
            writer = new PrintWriter(outFile);
        } catch (IOException ioe) {
            throw new IOError(ioe);
        }
        try {
            posWriter = new PrintWriter(posOutFile);
        } catch (IOException ioe) {
            // Do not leak the writer that was already opened.
            writer.close();
            throw new IOError(ioe);
        }
    }

    /**
     * Writes a string to the resulting file.  When a single document is being
     * generated the output is written without a line break; otherwise each
     * output is written on its own line.
     *
     * @param output the text to write
     */
    private synchronized void print(String output) {
        if (!generateOneDoc) {
            writer.println(output);
            return;
        }

        writer.print(output);
        // Without a line break, consecutive outputs would otherwise have their
        // last and first tokens fused together, so ensure that some white
        // space separates them.
        int length = output.length();
        if (length > 0 && !Character.isWhitespace(output.charAt(length - 1)))
            writer.print(' ');
    }

    /**
     * Parses a single XML file.  If {@code utterancePerDoc} is {@code true},
     * each utterance is written as a separate output; otherwise all the
     * utterances of the file are concatenated, optionally separated by
     * periods, and written as a single output.  Augmented utterances, when
     * enabled, are always written as separate outputs.
     *
     * <p>Parsing is best effort: if the file cannot be processed, the failure
     * is reported on standard error and the method returns normally.
     *
     * @param file the XML file to parse
     * @param utterancePerDoc {@code true} if each utterance is its own
     *        document
     */
    public void parseFile(File file, boolean utterancePerDoc) {
        try {
            // Build an xml document.
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            DocumentBuilder db = dbf.newDocumentBuilder();
            Document doc = db.parse(file);

            // Extract all utterances.
            NodeList utterances = doc.getElementsByTagName("u");
            StringBuilder fileBuilder = new StringBuilder();
            for (int i = 0; i < utterances.getLength(); ++i) {
                Element utteranceNode = (Element) utterances.item(i);
                List<String> wordsInUtterance = extractWords(utteranceNode);

                List<String> augmentedUtterances;
                if (generateAugmented)
                    augmentedUtterances = buildAugmentedUtterances(
                        utteranceNode, wordsInUtterance);
                else
                    augmentedUtterances = new LinkedList<String>();

                String utterance = join(wordsInUtterance);
                if (utterancePerDoc) {
                    // Write the utterance if an utterance is a document.
                    print(utterance);
                } else {
                    // Otherwise save the utterance.
                    fileBuilder.append(utterance);
                    if (separateByPeriod)
                        fileBuilder.append(".");
                    fileBuilder.append(" ");
                }

                // Print all the pseudo utterances constructed from the
                // comments.  These are always printed as separate documents to
                // avoid having them register as co-occurrences with other
                // utterances.
                for (String aug : augmentedUtterances)
                    print(aug);
            }

            // Write all the utterances if the whole xml file is a document.
            if (!utterancePerDoc)
                print(fileBuilder.toString());
        } catch (Exception e) {
            // Keep going with the remaining files, but say which file failed.
            System.err.println("Unable to parse Childes file: " + file);
            e.printStackTrace();
        }
    }

    /**
     * Returns the words of an utterance in document order, recording the part
     * of speech tag of each tagged word in {@code posTags}.  Word nodes that
     * do not start with text are skipped.
     */
    private List<String> extractWords(Element utteranceNode) {
        NodeList words = utteranceNode.getElementsByTagName("w");
        List<String> wordsInUtterance =
            new ArrayList<String>(words.getLength());
        for (int j = 0; j < words.getLength(); ++j) {
            Element wordNode = (Element) words.item(j);

            // Get just the word text, which is the value of the first child.
            // Skip the node when there is no such text rather than emitting a
            // "null" token or abandoning the rest of the file.
            Node textNode = wordNode.getFirstChild();
            String word = (textNode == null) ? null : textNode.getNodeValue();
            if (word == null)
                continue;

            // Get the part of speech tag.
            String pos = extractPosTag(wordNode);
            if (pos != null) {
                posTags.put(word, pos);
                if (appendPosTags)
                    word += "-" + pos;
            }
            wordsInUtterance.add(word);
        }
        return wordsInUtterance;
    }

    /**
     * Returns the value of the first grandchild of the first {@code pos} node
     * under the word node, or {@code null} if the word has no such value.
     */
    private static String extractPosTag(Element wordNode) {
        NodeList posNodeList = wordNode.getElementsByTagName("pos");
        if (posNodeList.getLength() == 0)
            return null;
        Node child = posNodeList.item(0).getFirstChild();
        Node grandChild = (child == null) ? null : child.getFirstChild();
        return (grandChild == null) ? null : grandChild.getNodeValue();
    }

    /**
     * Creates the pseudo utterances for an utterance from the comments
     * attached to it.
     *
     * <p>Each of the auxiliary nodes contains additional information about the
     * current utterance.  This may be syntactic information, comments on the
     * scene, descriptions of the action, or clarification by the observer.
     * For all comments but the syntactic, the comment text is used to create
     * new pseudo utterances by combining tokens from the utterance with
     * pseudo-tokens in the comment.  The pseudo-tokens have a "-GROUNDING"
     * suffix which distinguishes them from tokens actually present in the
     * uttered speech.
     */
    private static List<String> buildAugmentedUtterances(
            Element utteranceNode, List<String> wordsInUtterance) {
        List<String> augmentedUtterances = new LinkedList<String>();
        NodeList auxNodes = utteranceNode.getElementsByTagName("a");
        for (int j = 0; j < auxNodes.getLength(); ++j) {
            Element auxNode = (Element) auxNodes.item(j);

            // Get only those nodes that contain comments or descriptions on
            // the utterance that may be used to ground the words being
            // referred to.  A missing type attribute yields the empty string,
            // which is not a grounding type.
            if (!isGroundingComment(auxNode.getAttribute("type")))
                continue;

            // Skip comments without any text.
            Node commentNode = auxNode.getFirstChild();
            String commentOnUtterance =
                (commentNode == null) ? null : commentNode.getNodeValue();
            if (commentOnUtterance == null)
                continue;

            // Use the iterator factory to tokenize in the event that the user
            // has specified some form of token filtering.
            Iterator<String> tokenIter = IteratorFactory.tokenize(
                new BufferedReader(new StringReader(commentOnUtterance)));

            // For each of the tokens in the additional information, create a
            // pseudo-utterance using a word from the actual utterance and a
            // grounding-token.
            while (tokenIter.hasNext()) {
                String token = tokenIter.next();
                for (String word : wordsInUtterance)
                    augmentedUtterances.add(
                        word + " " + token + "-GROUNDING");
            }
        }
        return augmentedUtterances;
    }

    /**
     * Returns {@code true} if the auxiliary node type is one whose text is
     * used to generate augmented utterances.
     */
    private static boolean isGroundingComment(String auxNodeType) {
        for (String groundingType : GROUNDING_COMMENT_TYPES) {
            if (groundingType.equals(auxNodeType))
                return true;
        }
        return false;
    }

    /**
     * Returns the tokens joined by single spaces.
     */
    private static String join(List<String> tokens) {
        StringBuilder builder = new StringBuilder();
        for (Iterator<String> it = tokens.iterator(); it.hasNext(); ) {
            builder.append(it.next());
            if (it.hasNext())
                builder.append(" ");
        }
        return builder.toString();
    }

    /**
     * Finalizes the writing of documents: writes every recorded word and part
     * of speech tag pair, one per line, and closes both output files.
     */
    public void finish() {
        try {
            for (Map.Entry<String, String> entry : posTags.entrySet())
                posWriter.println(entry.getKey() + " " + entry.getValue());
        } finally {
            // Closing a PrintWriter also flushes it.
            posWriter.close();
            writer.close();
        }
    }

    /**
     * Runs the parser from the command line.  Two positional arguments are
     * required, the utterance output file and the part of speech output file,
     * along with at least one of the {@code fileList} and {@code
     * baseChildesDirectory} options.  When both options are given, only the
     * file list is processed.
     *
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        // Add the options.
        ArgOptions options = new ArgOptions();
        options.addOption('p', "partOfSpeechTag",
                          "If set, each token will be appended with it's " +
                          "part of speech tag, such as cat-noun",
                          false, null, "Optional");
        options.addOption('S', "separateByPeriod",
                          "If set, seperates sentences by periods",
                          false, null, "Optional");
        options.addOption('U', "utterancePerDoc",
                          "If set, one utterance is considered a document, " +
                          "otherwise all uterances in a file will be " +
                          "considered a document",
                          false, null, "Optional");
        options.addOption('g', "generateOneDoc",
                          "If set, only one document will be generated for " +
                          "all the text processed",
                          false, null, "Optional");
        options.addOption('A', "augmentedUtterances",
                          "Generates augmented utterances from comments " +
                          "about the utterances", false, null, "Augmented");
        options.addOption('F', "augmentedUtterancesFilter",
                          "Specifes a token filter for which tokens in " +
                          "comments are used to generate augmented utterances",
                          true, "SPEC", "Augmented");
        // The description states what the code below actually does: the file
        // list takes priority when both options are given.
        options.addOption('d', "baseChildesDirectory",
                          "The base childes directory. XML files will be " +
                          "searched for recursively from this base. This " +
                          "is ignored if the fileList option is set.",
                          true, "DIRECTORY", "Required (At least one of)");
        options.addOption('f', "fileList",
                          "The list of files to process",
                          true, "FILE[,FILE]*", "Required (At least one of)");

        // Process the options and emit errors if any required options are
        // missing.
        options.parseOptions(args);
        if ((!options.hasOption("fileList") &&
             !options.hasOption("baseChildesDirectory")) ||
             options.numPositionalArgs() != 2) {
            System.out.println(
                    "usage: java ChildesParser [options] " +
                    "<outFile> <posOutFile>\n" +
                    options.prettyPrint());
            return;
        }

        // The default is to have all utterances from a conversation be in a
        // single document.
        boolean utterancePerDoc = options.hasOption("utterancePerDoc");

        boolean genAugmented = options.hasOption("augmentedUtterances");
        if (genAugmented && options.hasOption("augmentedUtterancesFilter")) {
            String filterConf =
                options.getStringOption("augmentedUtterancesFilter");
            Properties p = System.getProperties();
            p.setProperty(IteratorFactory.TOKEN_FILTER_PROPERTY, filterConf);
            IteratorFactory.setProperties(p);
        }

        ChildesParser parser = new ChildesParser(options.getPositionalArg(0),
                                                 options.getPositionalArg(1),
                                                 genAugmented,
                                                 options.hasOption('S'),
                                                 options.hasOption('p'),
                                                 options.hasOption('g'));

        // Process the given file list, if provided.
        if (options.hasOption("fileList")) {
            String[] files = options.getStringOption("fileList").split(",");
            for (String file : files)
                parser.parseFile(new File(file), utterancePerDoc);
        } else {
            // Otherwise search for xml files to process.
            File baseDir =
                new File(options.getStringOption("baseChildesDirectory"));
            findXmlFiles(parser, utterancePerDoc, baseDir);
        }

        parser.finish();
    }

    /**
     * Recursively finds any XML documents under a directory and parses them.
     * A directory whose contents cannot be listed is reported on standard
     * error and skipped.
     *
     * @param parser the parser that processes each XML file found
     * @param utterancePerDoc {@code true} if each utterance is its own
     *        document
     * @param directory the directory to search
     */
    public static void findXmlFiles(ChildesParser parser,
                                    boolean utterancePerDoc,
                                    File directory) {
        // listFiles returns null when the path is not a directory or cannot
        // be read.
        File[] files = directory.listFiles();
        if (files == null) {
            System.err.println("Unable to list the files in: " + directory);
            return;
        }

        for (File file : files) {
            if (file.isDirectory())
                findXmlFiles(parser, utterancePerDoc, file);
            else if (file.isFile() && file.getPath().endsWith(".xml"))
                parser.parseFile(file, utterancePerDoc);
        }
    }
}