All Downloads are FREE. Search and download functionalities are using the official Maven repository.

nlp.corpora.SwitchboardDialogueDB Maven / Gradle / Ivy

Go to download

Natural language processing toolbox using Sigma knowledge engineering system.

There is a newer version: 1.1
Show newest version
package nlp.corpora;

import com.opencsv.CSVReader;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

/*
Copyright 2017 Cloudminds Technology, Inc

Author: [email protected]

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program ; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston,
MA  02111-1307 USA
 */

/****************************************************************************************
 * Created by Vishwas Mruthyunjaya on 1/4/17.
 */
public class SwitchboardDialogueDB {

    // Global Properties for SwitchDialogueDB Object.
    private String directoryName = "";
    private Path addToDirectory;
    private List foldersList = new ArrayList<>();
    private List filesList = new ArrayList<>();

    /************************************************************************************
     * This method takes the converted data and
     * stores in a text file format which is required for the ChatBot.
     */
    private void writeFile(List rawData, String fileName, String folderName) {

        String addToFileName;
        String addToFolderName;
        String addToFilePath;
        Path addToFolderPath;
        try {
            addToFileName = StringUtils.substringBefore(FilenameUtils.getBaseName(fileName), ".");
            addToFolderName = folderName.substring(folderName.lastIndexOf("/") + 1);
            addToFolderPath = Paths.get(addToDirectory+"/"+addToFolderName+"_parsed/");
            Files.createDirectories(addToFolderPath);
            addToFilePath = addToFolderPath+"/"+addToFileName+"_parsed.txt";
            Path file = Paths.get(addToFilePath);
            Files.write(file, rawData, Charset.forName("UTF-8"));
        }
        catch (Exception ex) {
            System.out.println (ex.toString());
            System.out.println("writeFile() Method: Could not find the file");
        }
    }

    /************************************************************************************
     * This method is to read the file(s) from Switchboard Dialogue Database and extract
     * the necessary column as well as convert them into required one linguistic turn format.
     */
    private void readFile(String filePath, String folderPath) {

        // Initialise parameters
        String [] data;
        StringBuilder buf = new StringBuilder();
        List rawData = new ArrayList<>();
        // Identified conversation column and conversing agents column in the CSV file
        int conversationColumnIndex = 8;
        int conversingAgentsColumnIndex = 7;

        // CSV file reading and formatting
        try {
            // CSV File reader
            CSVReader reader = new CSVReader(new FileReader(filePath));
            // This skips the header row in CSV file
            reader.readNext();
            // Formatting the CSV file data to required style
            while ((data = reader.readNext()) != null) {
                // data[] is an array of values from the line
                data[conversationColumnIndex] = data[conversationColumnIndex].replaceAll("[{][a-zA-Z0-9]","");
                data[conversationColumnIndex] = data[conversationColumnIndex].replaceAll("[{\\-}\\[\\]\"/+()]","");
                data[conversationColumnIndex] = data[conversationColumnIndex].replaceAll("( )+"," ");
                if (Integer.parseInt(data[conversingAgentsColumnIndex]) == 1 && buf.length() > 0) {
                    rawData.add(buf.toString().trim());
                    buf.setLength(0);
                }
                buf.append(data[conversationColumnIndex].trim());
                buf.append(" ");
            }
            if (buf.length() > 0 ) rawData.add(buf.toString().trim());
            // Invoke writeFile method to write the converted data into a .txt file
            writeFile(rawData, filePath, folderPath);
        }
        catch (IOException ex) {
            System.out.println (ex.toString());
            System.out.println("Warning: In method readFile(), could not find the file");
        }
        catch (Exception e) {
            System.out.println("Warning: In method readFile(), error in data formatting");
        }
    }

    /*************************************************************************************
     * listFolders() method takes a directory path as argument and lists all the folder
     * under the given folder path.
     * RETURN --> list of absolute folder paths.
     */
    private List listFolders(String directoryPath) {

        List foldersPath = new ArrayList<>();
        try {
            File directory = new File(directoryPath);
            //get all the folders from a directory
            File[] folderList = directory.listFiles();
            if (folderList != null) {
                for (File file : folderList) {
                    if (file.isDirectory()) {
                        foldersPath.add(file.getAbsolutePath());
                    }
                }
            }
        }
        catch (Exception e) {
            System.out.println("Warning: In method listFolders(), error in listing folders under: " + directoryPath);
        }
        return foldersPath;
    }

    /*************************************************************************************
     * listFiles() method takes a folder path as argument and lists all the files under
     * the given folder path.
     * RETURN --> list of absolute file paths.
     */
    private List listFiles(String folderPath) {
        
        List filesPath = new ArrayList<>();
        try {
            File directory = new File(folderPath);
            //get all the files from a folder
            File[] filesList = directory.listFiles();
            if (filesList != null) {
                for (File file : filesList) {
                    if (file.isFile() && FilenameUtils.getExtension(file.toString()).equalsIgnoreCase("csv")) {
                        filesPath.add(file.getAbsolutePath());
                    }
                }
            }
        }
        catch (Exception e) {
            System.out.println("Warning: In method listFiles(), error in listing the files under: " + folderPath);
        }
        return filesPath;
    }

    /**************************************************************************************
     * main() instantiates class and runs functionality
     */
    public static void main(String[] args) throws IOException {

        SwitchboardDialogueDB switchboardDialogueDBObject = new SwitchboardDialogueDB();
        Properties prop = new Properties();

        try {
            InputStream input = new FileInputStream("corpora.properties");
            prop.load(input);
        }
        catch (IOException e) {
            System.out.println("Problem loading resource file " + e);
            e.printStackTrace();
        }

        switchboardDialogueDBObject.directoryName = prop.getProperty("switchboardDialogueDirectoryName");
        switchboardDialogueDBObject.addToDirectory = Paths.get(prop.getProperty("switchboardDialogueParsedDirectoryName"));

        Files.createDirectories(switchboardDialogueDBObject.addToDirectory);
        switchboardDialogueDBObject.foldersList = switchboardDialogueDBObject.listFolders(switchboardDialogueDBObject.directoryName);
        for (String folderPath: switchboardDialogueDBObject.foldersList) {
            switchboardDialogueDBObject.filesList = switchboardDialogueDBObject.listFiles(folderPath);
            for (String filePath: switchboardDialogueDBObject.filesList) {
                switchboardDialogueDBObject.readFile(filePath, folderPath);
            }
        }

    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy