All downloads are free. Search and download functionality uses the official Maven repository.

nlp.MBoxReader Maven / Gradle / Ivy

Go to download

Natural language processing toolbox using Sigma knowledge engineering system.

There is a newer version: 1.1
Show newest version
package nlp;

import com.articulate.sigma.StringUtil;
import semRewrite.datesandnumber.InterpretNumerics;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by apease on 7/23/16.
 */
public class MBoxReader {

    //Enumeration of the property names we'll use:
    public static String PROP_NAME_SENDER		= "Sender";
    public static String PROP_NAME_DATE			= "MessageDate";
    public static String PROP_NAME_SENDER_INFO	= "SenderInfo";
    public static String PROP_NAME_BODY			= "Body";
    public static String PROP_NAME_FACTS		= "Facts";
    public static HashSet> records = new HashSet<>();

    /** ***************************************************************
     * This regular expression will be used to extract fields from the From
     * line of each message. It matches the word "From" followed by a
     * space, followed by a sequence of non-whitespace characters which
     * constitute the sender, followed by some whitespace and a string of
     * exactly 24 characters which consitutes the date and possibly followed
     * by a string of characters containing other information about the
     * sender.
     */
    private static Pattern fromLineRegex
            = Pattern.compile("From (\\S*)\\s*(.{24})(.*)");

    /** ***************************************************************
     */
    public void execute(String path) {

        //Get the paths of the mbox files to process:
        ArrayList mboxFiles = new ArrayList<>();
        File folder = new File(path);
        if (!folder.exists()) {
            System.out.println("Error in MBoxReader.execute(): '" + folder + "' doesn't exist ");
            return;
        }
        File[] listOfFiles;
        if (folder.isFile())
            mboxFiles.add(path);
        else {
            listOfFiles = folder.listFiles();

            for (File file : listOfFiles) {
                if (file.getName().indexOf(".") == -1)
                    mboxFiles.add(file.getName());
                if (file.getName().indexOf(".sbd") != -1)
                    execute(folder.getName() + File.separator + file.getName());
            }

            if (mboxFiles == null)
                System.out.println("Error inMBoxReader.execute: You must specify at least one filename.");
        }
        // Now that we have processed the configuration, we're ready to
        // ;oop over each of the files to parse:
        for (String file : mboxFiles) {
            try {
                //Open the current file:
                BufferedReader reader = new BufferedReader(new FileReader(file));
                // Process the current file. The first line of each message should be
                // "From   ":
                String curFromLine = reader.readLine();
                if (curFromLine == null) {
                    System.out.println("mbox file '" + file + "' was empty.");
                    continue; //Continue on to next file.
                }

                //Loop over the messages in the file:
                while (curFromLine != null) {
                    //Report our progress:
                    System.out.println("Processing message: " + curFromLine + "...");

                    //Extract fields from the from line:
                    Matcher matcher = fromLineRegex.matcher(curFromLine);
                    if (!matcher.matches()) {
                        System.out.println("Invalid From line syntax in file '"
                                + file + "': " + curFromLine);
                        break; //Abort this file.
                    }

                    String sender = matcher.group(1);
                    String date = matcher.group(2);
                    String senderInfo = matcher.group(3);

                    //Create a new Record for this message and add the from line
                    //fields as properties:
                    HashMap record = new HashMap();

                    record.put(PROP_NAME_SENDER, sender);
                    record.put(PROP_NAME_DATE, date);

                    if (!senderInfo.equals(""))
                        record.put(PROP_NAME_SENDER_INFO, senderInfo);

                    processHeaders(reader, record); //process the message headers.

                    //The rest of the message is the message body. This method will
                    //read that in, add it to the record and return the From line of
                    //the next message, if any:
                    curFromLine = processBody(reader, record);
                    records.add(record); //Emit the completed record.
                }
                reader.close(); //close the current file.
            }
            catch (IOException e) {
                //There was a problem processing the current file, but maybe the
                //others will work; we'll log an error and continue:
                System.out.println("Error processing mbox file '" + file + "': "
                        + e.getMessage());
            }
        }
    }

    /** ***************************************************************
     */
    private void processHeaders(BufferedReader reader, HashMap record) {

        try {
            //Loop until we reach a blank line, which indicates the end of the
            //headers, or we reach the end of the input stream:
            while (true) {
                String line = reader.readLine();
                if (line == null)
                    break;
                if (line.equals(""))
                    break;

                //Each header has the form "Name: value". Extract the name and
                //value from the current header:
                int colonPos = line.indexOf(':');
                if (colonPos == -1) {
                    System.out.println("Invalid message header format. Expected a colon in "
                            + "the line '" + line + "'");
                    continue; //Move on to next header.
                }
                record.put(line.substring(0,colonPos),line.substring(colonPos + 1));
            }
        }
        catch (IOException ioe) {
            System.out.println("Error in MBoxReader.processHeaders()");
            ioe.printStackTrace();
        }
    }

    /** ***************************************************************
     * Beginning at the current position of the reader, this method reads
     * in a message body until it reaches a blank line followed by a "From"
     * line indicating the start of the next message, or the stream runs out
     * of data. Once it is done reading in the body, it adds the body text
     * to the specified record as a property, and returns the "From" line of
     * the next message, if any.
     *
     * @param reader  Reader to read body from
     * @param record  Record to add body to
     *
     * @return  The "From" line of the next message in the reader stream, or
     *          null if there are no more messages
     *
     * @throws IOException
     */
    private String processBody(BufferedReader reader, HashMap record) {

        String body = "";
        String fromLine = null;
        try {
            while (true) {
                String line = reader.readLine();
                if (line == null)
                    break;
                if (line.equals("")) {
                    fromLine = reader.readLine();
                    if (fromLine == null)
                        break;
                    //If the line begins with "From " then it is a From line:
                    if (fromLine.regionMatches(true, 0, "From ", 0, 5))
                        break; //A new message was found.
                    //not a from line...
                    line += fromLine;
                    fromLine = null;
                }
                body += line; //Append line to body.
            }
            record.put(PROP_NAME_BODY, body); // Add the body to the record:
        }
        catch (IOException ioe) {
            System.out.println("Error in MBoxReader.processBody()");
            ioe.printStackTrace();
        }
        return fromLine;
    }

    /** ***************************************************************
     */
    public void extractInfo() {

        System.out.println("In MBoxReader.extractInfo()");
        for (HashMap element : records) {
            if (element.keySet().contains(PROP_NAME_BODY)) {
                String body = StringUtil.removeHTML(element.get(PROP_NAME_BODY));
                System.out.println("In MBoxReader.extractInfo() from " + body);
                List results = InterpretNumerics.getSumoTerms(body);
                System.out.println("INFO in MBoxReader.extractInfo(): " + results);
            }
        }
    }

    /** ***************************************************************
     */
    public static void main(String[] args) {

        MBoxReader mbr = new MBoxReader();
        if (args.length > 0) {
            if (args[0].contains("-h")) {
                System.out.println("Usage: java -classpath . com.articulate.sigma.util.MBoxReader -f file");
            }
            else if (args[0].equals("-f") && args.length > 1) {
                mbr.execute(args[1]);
                mbr.extractInfo();
                //System.out.println(records);
            }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy