// nlp.corpora.NPSChatCorpus (Maven / Gradle / Ivy)
// Go to download
// Show more of this group. Show more artifacts with this name.
// Show all versions of sigma-nlp. Show documentation.
// Natural language processing toolbox using the Sigma knowledge engineering system.
package nlp.corpora;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
import java.util.stream.Collectors;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.io.FilenameUtils;
import org.apache.xerces.dom.DeferredElementImpl;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
* This code is copyright CloudMinds 2017.
* This software is released under the GNU Public License .
* Users of this code also consent, by use of this code, to credit Articulate Software
* and Teknowledge in any writings, briefings, publications, presentations, or
* other representations of any software which incorporates, builds on, or uses this
* code. Please cite the following article in any publication with references:
* Pease, A., (2003). The Sigma Ontology Development Environment,
* in Working Notes of the IJCAI-2003 Workshop on Ontology and Distributed Systems,
* August 9, Acapulco, Mexico.
*
* Created by charlescostello on 1/18/17.
* Class to parse NPS Chat corpus files and write to new files
* Data source: faculty.nps.edu/cmartell/NPSChat.htm
*/
public class NPSChatCorpus {

    /** Substring that marks a user reference: words containing it are dropped from post text,
     *  and the "user" attribute is split on it to obtain the speaker id. */
    private static final String USER_MARKER = "sUser";

    /** Prefix of an action post; such posts are rewritten as {@code <...>}. */
    private static final String ACTION_PREFIX = ".ACTION";

    /** Extension (with dot) of the corpus files that get parsed. */
    private static final String XML_SUFFIX = ".xml";

    private String rawDirectoryName;
    private String parsedDirectoryName;

    /****************************************************************
     * Writes parsed lines to a new file, one line per entry, encoded as UTF-8.
     * An I/O failure is reported on standard output and does not propagate.
     *
     * @param parsedLines are the parsed dialog lines
     * @param fileName is the path of the file to create or overwrite
     */
    private void writeFile(List<String> parsedLines, String fileName) {

        // Explicit charset: FileWriter would silently use the platform default encoding
        try (BufferedWriter bufferedWriter = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(fileName), StandardCharsets.UTF_8))) {
            for (String parsedLine : parsedLines) {
                bufferedWriter.write(parsedLine);
                bufferedWriter.newLine();
            }
        }
        catch (IOException e) {
            System.out.println("Error with " + fileName + ": " + e);
            e.printStackTrace();
        }
    }

    /****************************************************************
     * Cleans the text of a single post: drops every space-separated word that
     * contains the user marker, trims the result, and rewrites an action post
     * (text starting with ".ACTION") as {@code <body>} with one trailing period removed.
     *
     * @param rawText is the text content of a Post element, must not be null
     * @return the cleaned, trimmed text (never null)
     */
    static String cleanPostText(String rawText) {

        String text = Arrays.stream(rawText.split(" "))
                .filter(word -> !word.contains(USER_MARKER))
                .collect(Collectors.joining(" "))
                .trim();
        if (!text.startsWith(ACTION_PREFIX)) {
            return text;
        }
        // Slice relative to the prefix rather than a fixed offset, so an action
        // that has no body left after mention removal cannot overrun the string
        String body = text.substring(ACTION_PREFIX.length()).trim();
        if (body.endsWith(".")) {
            body = body.substring(0, body.length() - 1);
        }
        return "<" + body + ">";
    }

    /****************************************************************
     * Extracts the speaker id from a Post's "user" attribute: the part that
     * follows the user marker. When the attribute does not have anything after
     * a marker, the whole attribute value is used as the id.
     *
     * @param userAttribute is the value of the "user" attribute, must not be null
     * @return the speaker id
     */
    static String speakerId(String userAttribute) {

        String[] parts = userAttribute.split(USER_MARKER);
        return parts.length > 1 ? parts[1] : userAttribute;
    }

    /****************************************************************
     * Moves the buffered utterance into the list of lines (unless it is blank)
     * and empties the buffer.
     */
    private static void flushUtterance(StringBuilder buffer, List<String> lines) {

        String utterance = buffer.toString().trim();
        if (!utterance.isEmpty()) {
            lines.add(utterance);
        }
        buffer.setLength(0);
    }

    /****************************************************************
     * Reads an NPS Chat XML file and merges consecutive posts by the same
     * speaker into a single line. Posts whose text is exactly "PART" or "JOIN"
     * are ignored, as are posts that do not start with a text node.
     * A parse failure is reported on standard output; the lines collected so
     * far (possibly none) are returned.
     *
     * @param fileName is the path of the NPS Chat XML file
     * @return a list of dialog lines, one per uninterrupted speaker turn
     */
    private List<String> parseFile(String fileName) {

        List<String> lines = new ArrayList<>();
        StringBuilder buffer = new StringBuilder();
        String currentUser = "";

        try {
            DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            // parse(File) rather than parse(String): the String overload expects a URI,
            // which breaks on paths containing spaces or other reserved characters
            Document document = builder.parse(new File(fileName));
            document.getDocumentElement().normalize();
            NodeList posts = document.getElementsByTagName("Post");

            for (int i = 0; i < posts.getLength(); i++) {
                // getElementsByTagName only yields elements; the DOM interface is
                // enough, no need to depend on a concrete parser implementation class
                Element post = (Element) posts.item(i);
                String rawText = post.getFirstChild() == null ? null : post.getFirstChild().getNodeValue();
                if (rawText == null) {
                    continue; // no leading text node (empty post or post starting with a child element)
                }
                String text = cleanPostText(rawText);
                if (text.equals("PART") || text.equals("JOIN")) {
                    continue;
                }
                String user = speakerId(post.getAttribute("user"));
                if (!user.equals(currentUser)) {
                    flushUtterance(buffer, lines);
                    currentUser = user;
                }
                buffer.append(text).append(' ');
            }
            flushUtterance(buffer, lines);
        }
        catch (ParserConfigurationException | IOException | SAXException e) {
            System.out.println("Error parsing " + fileName + ": " + e);
            e.printStackTrace();
        }
        return lines;
    }

    /****************************************************************
     * Iterates through all regular files ending in ".xml" in the raw directory,
     * parses each one and writes the result under the same file name in the
     * parsed directory.
     */
    private void parseAllFiles() {

        File directory = new File(rawDirectoryName);
        File[] files = directory.listFiles();
        if (files == null) {
            System.out.println("Cannot list directory " + rawDirectoryName);
            return;
        }

        // Parse and write each file
        for (File file : files) {
            if (file.isFile() && file.getName().endsWith(XML_SUFFIX)) {
                List<String> parsedLines = parseFile(file.getAbsolutePath());
                // File(parent, child) inserts the separator when the configured
                // directory name does not end with one
                writeFile(parsedLines, new File(parsedDirectoryName, file.getName()).getPath());
            }
        }
    }

    /****************************************************************
     * Instantiates the class and runs the conversion. The raw and parsed
     * directory names are read from the keys "npsChatDirectoryName" and
     * "npsChatParsedDirectoryName" of the file "corpora.properties" in the
     * working directory. If the file cannot be read or a key is missing, the
     * problem is reported and nothing is converted.
     *
     * @param args command line arguments (unused)
     */
    public static void main(String[] args) {

        // Get paths from properties file
        Properties prop = new Properties();
        try (InputStream input = new FileInputStream("corpora.properties")) {
            prop.load(input);
        }
        catch (IOException e) {
            System.out.println("Problem loading resource file " + e);
            e.printStackTrace();
            return;
        }

        String rawDirectory = prop.getProperty("npsChatDirectoryName");
        String parsedDirectory = prop.getProperty("npsChatParsedDirectoryName");
        if (rawDirectory == null || parsedDirectory == null) {
            System.out.println("corpora.properties must define npsChatDirectoryName and npsChatParsedDirectoryName");
            return;
        }

        // Instantiate class
        NPSChatCorpus npsChatCorpus = new NPSChatCorpus();
        npsChatCorpus.rawDirectoryName = rawDirectory;
        npsChatCorpus.parsedDirectoryName = parsedDirectory;

        // Run functionality
        npsChatCorpus.parseAllFiles();
    }
}