nlp.corpora.SwitchboardDialogueDB Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of sigma-nlp Show documentation
Show all versions of sigma-nlp Show documentation
Natural language processing toolbox using Sigma knowledge engineering system.
package nlp.corpora;
import com.opencsv.CSVReader;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;
/*
Copyright 2017 Cloudminds Technology, Inc
Author: [email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program ; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA 02111-1307 USA
*/
/****************************************************************************************
* Created by Vishwas Mruthyunjaya on 1/4/17.
*/
public class SwitchboardDialogueDB {

    // Zero-based index of the CSV column that holds the conversation text.
    private static final int CONVERSATION_COLUMN_INDEX = 8;

    // Zero-based index of the CSV column parsed as an integer to detect turn
    // boundaries: a value of 1 closes the turn accumulated so far.
    private static final int CONVERSING_AGENTS_COLUMN_INDEX = 7;

    // Cleaning patterns, compiled once instead of once per CSV row.
    // An opening brace followed by a single alphanumeric character.
    private static final Pattern BRACE_TAG = Pattern.compile("[{][a-zA-Z0-9]");
    // Remaining annotation punctuation: { - } [ ] " / + ( )
    private static final Pattern MARKUP_CHARS = Pattern.compile("[{\\-}\\[\\]\"/+()]");
    // One or more consecutive spaces.
    private static final Pattern SPACE_RUN = Pattern.compile("( )+");

    // Global properties for the SwitchboardDialogueDB object (populated by main()).
    private String directoryName = "";
    private Path addToDirectory;
    private List<String> foldersList = new ArrayList<>();
    private List<String> filesList = new ArrayList<>();

    /************************************************************************************
     * Writes the converted turns, one per line and encoded as UTF-8, to
     * <addToDirectory>/<folder>_parsed/<file>_parsed.txt, creating the folder
     * if it does not exist yet.
     *
     * @param rawData    the turns to write, one list element per output line
     * @param fileName   path of the source CSV file; its base name, cut at the
     *                   first '.', names the output file
     * @param folderName path of the folder the CSV file came from; its last path
     *                   element names the output folder
     */
    private void writeFile(List<String> rawData, String fileName, String folderName) {

        try {
            String addToFileName = StringUtils.substringBefore(FilenameUtils.getBaseName(fileName), ".");
            // getFileName() copes with both '/' and '\' separators as well as a
            // trailing separator; it is null only for a root path.
            Path lastElement = Paths.get(folderName).getFileName();
            String addToFolderName = (lastElement == null) ? "" : lastElement.toString();
            Path addToFolderPath = addToDirectory.resolve(addToFolderName + "_parsed");
            Files.createDirectories(addToFolderPath);
            Path file = addToFolderPath.resolve(addToFileName + "_parsed.txt");
            Files.write(file, rawData, StandardCharsets.UTF_8);
        }
        catch (IOException | RuntimeException ex) {
            System.out.println(ex.toString());
            System.out.println("writeFile() Method: Could not write the parsed output for " + fileName);
        }
    }

    /************************************************************************************
     * Strips the annotation markup from one conversation cell and collapses
     * every run of spaces into a single space.
     *
     * @param text raw content of the conversation column
     * @return the cleaned text (not trimmed)
     */
    private static String cleanUtterance(String text) {

        String cleaned = BRACE_TAG.matcher(text).replaceAll("");
        cleaned = MARKUP_CHARS.matcher(cleaned).replaceAll("");
        return SPACE_RUN.matcher(cleaned).replaceAll(" ");
    }

    /************************************************************************************
     * Reads one CSV file of the Switchboard Dialogue Database, extracts the
     * conversation column and merges consecutive rows into one line per
     * linguistic turn, then hands the result to writeFile().
     *
     * The first row is skipped as a header. If any row cannot be processed
     * (too few columns, non-numeric turn marker) the whole file is skipped and
     * nothing is written for it.
     *
     * @param filePath   path of the CSV file to convert
     * @param folderPath path of the folder containing the file
     */
    private void readFile(String filePath, String folderPath) {

        StringBuilder buf = new StringBuilder();
        List<String> rawData = new ArrayList<>();
        // The output is written as UTF-8, so decode the input as UTF-8 as well
        // rather than with the platform default charset. try-with-resources
        // closes the reader (and the underlying stream) on every path.
        try (CSVReader reader = new CSVReader(
                new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8))) {
            // This skips the header row in the CSV file
            reader.readNext();
            String[] data;
            while ((data = reader.readNext()) != null) {
                String utterance = cleanUtterance(data[CONVERSATION_COLUMN_INDEX]);
                // A marker value of 1 means the buffered text forms a complete turn.
                if (Integer.parseInt(data[CONVERSING_AGENTS_COLUMN_INDEX]) == 1 && buf.length() > 0) {
                    rawData.add(buf.toString().trim());
                    buf.setLength(0);
                }
                buf.append(utterance.trim());
                buf.append(" ");
            }
            // Flush the last turn, which has no following marker to close it.
            if (buf.length() > 0) {
                rawData.add(buf.toString().trim());
            }
            // Invoke writeFile method to write the converted data into a .txt file
            writeFile(rawData, filePath, folderPath);
        }
        catch (IOException ex) {
            System.out.println(ex.toString());
            System.out.println("Warning: In method readFile(), could not read the file " + filePath);
        }
        catch (Exception e) {
            // Kept broad on purpose: besides malformed rows this also covers
            // parser-specific checked exceptions of the CSV library.
            System.out.println("Warning: In method readFile(), error in data formatting in "
                    + filePath + ": " + e);
        }
    }

    /*************************************************************************************
     * Lists all the folders directly under the given directory.
     *
     * @param directoryPath path of the directory to scan
     * @return list of absolute folder paths; empty if the directory cannot be listed
     */
    private List<String> listFolders(String directoryPath) {

        List<String> foldersPath = new ArrayList<>();
        try {
            // listFiles() returns null when the path is not a readable directory.
            File[] entries = new File(directoryPath).listFiles();
            if (entries != null) {
                for (File entry : entries) {
                    if (entry.isDirectory()) {
                        foldersPath.add(entry.getAbsolutePath());
                    }
                }
            }
        }
        catch (RuntimeException e) {
            System.out.println("Warning: In method listFolders(), error in listing folders under: "
                    + directoryPath + " (" + e + ")");
        }
        return foldersPath;
    }

    /*************************************************************************************
     * Lists all the CSV files (extension matched case-insensitively) directly
     * under the given folder.
     *
     * @param folderPath path of the folder to scan
     * @return list of absolute file paths; empty if the folder cannot be listed
     */
    private List<String> listFiles(String folderPath) {

        List<String> filesPath = new ArrayList<>();
        try {
            // listFiles() returns null when the path is not a readable directory.
            File[] entries = new File(folderPath).listFiles();
            if (entries != null) {
                for (File entry : entries) {
                    if (entry.isFile() && "csv".equalsIgnoreCase(FilenameUtils.getExtension(entry.toString()))) {
                        filesPath.add(entry.getAbsolutePath());
                    }
                }
            }
        }
        catch (RuntimeException e) {
            System.out.println("Warning: In method listFiles(), error in listing the files under: "
                    + folderPath + " (" + e + ")");
        }
        return filesPath;
    }

    /**************************************************************************************
     * main() instantiates the class and runs the conversion: it loads
     * corpora.properties from the working directory, then converts every CSV
     * file found in each sub-folder of switchboardDialogueDirectoryName into a
     * text file under switchboardDialogueParsedDirectoryName.
     *
     * @param args unused
     * @throws IOException           if the output directory cannot be created
     * @throws IllegalStateException if a required property is missing
     */
    public static void main(String[] args) throws IOException {

        SwitchboardDialogueDB switchboardDialogueDBObject = new SwitchboardDialogueDB();
        Properties prop = new Properties();
        try (InputStream input = new FileInputStream("corpora.properties")) {
            prop.load(input);
        }
        catch (IOException e) {
            System.out.println("Problem loading resource file " + e);
            e.printStackTrace();
        }
        String sourceDirectory = prop.getProperty("switchboardDialogueDirectoryName");
        String parsedDirectory = prop.getProperty("switchboardDialogueParsedDirectoryName");
        // Fail with a clear message instead of a NullPointerException further down.
        if (sourceDirectory == null || parsedDirectory == null) {
            throw new IllegalStateException("corpora.properties must define both "
                    + "switchboardDialogueDirectoryName and switchboardDialogueParsedDirectoryName");
        }
        switchboardDialogueDBObject.directoryName = sourceDirectory;
        switchboardDialogueDBObject.addToDirectory = Paths.get(parsedDirectory);
        Files.createDirectories(switchboardDialogueDBObject.addToDirectory);
        switchboardDialogueDBObject.foldersList =
                switchboardDialogueDBObject.listFolders(switchboardDialogueDBObject.directoryName);
        for (String folderPath : switchboardDialogueDBObject.foldersList) {
            switchboardDialogueDBObject.filesList = switchboardDialogueDBObject.listFiles(folderPath);
            for (String filePath : switchboardDialogueDBObject.filesList) {
                switchboardDialogueDBObject.readFile(filePath, folderPath);
            }
        }
    }
}