com.legstar.cobol.AbstractCobolSourceCleaner Maven / Gradle / Ivy

Go to download
/*******************************************************************************
 * Copyright (c) 2010 LegSem.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser Public License v2.1
 * which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
 * 
 * Contributors:
 *     LegSem - initial API and implementation
 ******************************************************************************/
package com.legstar.cobol;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.legstar.antlr.CleanerException;

/**
 * In order to reduce the lexer/parser grammar complexity, this class will
 * remove all unnecessary characters from the original source. This way, the
 * ANTLR lexer will be presented with a purified source that only contains data
 * division entries.
 * 
 * This allows users to submit complete COBOL programs or fragments of COBOL
 * programs with non data description statements to the parser without the need
 * to add grammar rules for all these cases.
 * 
 */
public abstract class AbstractCobolSourceCleaner {

    /** Cobol sentences are assumed to be terminated by this character. */
    public static final char COBOL_DELIMITER = '.';

    /**
     * Pattern that recognizes the start of a data description entry: one or
     * two digits, preceded by start of input, whitespace or a period, and
     * followed by whitespace, a period or end of input.
     */
    public static final Pattern DATA_DESCRIPTION_START = Pattern
            .compile("(^|\\s|\\" + COBOL_DELIMITER + ")\\d(\\d)?(\\s|\\"
                    + COBOL_DELIMITER + "|$)");

    /**
     * Pattern that recognizes the end of a data description entry: a period
     * followed by either whitespace or end of input.
     */
    public static final Pattern DATA_DESCRIPTION_END = Pattern.compile("(\\"
            + COBOL_DELIMITER + "$)|(\\" + COBOL_DELIMITER + "\\s)");

    /**
     * Pattern that recognizes the start of an alphanumeric literal: a quote or
     * an apostrophe at start of input or preceded by whitespace.
     */
    public static final Pattern ALPHANUM_LITERAL_START = Pattern
            .compile("(^|\\s)[\\\"\']");

    /** Pattern that recognizes the start of a procedure division. */
    public static final Pattern PROCEDURE_DIVISION = Pattern.compile(
            "^(\\s)*PROCEDURE DIVISION", Pattern.CASE_INSENSITIVE);

    /** Pattern that recognizes the start of an identification division. */
    public static final Pattern IDENTIFICATION_DIVISION = Pattern.compile(
            "^(\\s)*ID(ENTIFICATION)? DIVISION", Pattern.CASE_INSENSITIVE);

    /** Pattern that recognizes the start of a data division. */
    public static final Pattern DATA_DIVISION = Pattern.compile(
            "^(\\s)*DATA DIVISION", Pattern.CASE_INSENSITIVE);

    /**
     * List of compiler directives (they can be period delimited but are
     * guaranteed to be alone on a line).
     */
    public static final List < String > COMPILER_DIRECTIVES = Arrays.asList(
            "EJECT", "SKIP", "SKIP1", "SKIP2", "SKIP3");

    /**
     * Index of the capturing group of {@link #DATA_DESCRIPTION_START} that
     * holds whatever follows the level digits (whitespace, period, or nothing
     * at end of input).
     */
    private static final int LEVEL_TRAILER_GROUP = 3;

    /** Handles error messages. */
    private final RecognizerErrorHandler _errorHandler;

    /**
     * Construct with a shared error handler.
     * 
     * @param errorHandler handles error messages
     */
    public AbstractCobolSourceCleaner(final RecognizerErrorHandler errorHandler) {
        _errorHandler = errorHandler;

    }

    /**
     * Takes in a raw COBOL source, potentially containing sequence numbers or
     * non data description statements and produces a clean source code.
     * <p>
     * Statements which are not data descriptions become empty lines in order to
     * preserve original line numbering. Each input line produces exactly one
     * newline-terminated output line.
     * 
     * @param cobolSource the raw COBOL source
     * @return the source cleaned up
     * @throws CleanerException if the source is null, cannot be read, or the
     *             cleaned result is no longer than a single newline
     */
    public String clean(final String cobolSource) throws CleanerException {
        if (cobolSource != null) {
            BufferedReader reader = new BufferedReader(new StringReader(
                    cobolSource));
            String line;
            StringBuilder cleanedSource = new StringBuilder();
            CleaningContext context = new CleaningContext();
            try {
                while ((line = reader.readLine()) != null) {
                    if (isLineOfCode(line) && isDataDivision(line, context)) {
                        cleanedSource.append(removeExtraneousCharacters(
                                cleanLine(line), context));
                    }
                    /* Always emit a line so that line numbers are preserved. */
                    cleanedSource.append("\n");
                }
                /*
                 * NOTE(review): every input line contributes a newline, so
                 * this guard only fires for sources made of at most one line.
                 * A multi-line source without any data description is
                 * returned as blank lines. Confirm this is intended.
                 */
                if (cleanedSource.length() <= "\n".length()) {
                    throw new CleanerException(
                            "No data descriptions found. Are you sure this is COBOL source?");
                }
                return cleanedSource.toString();
            } catch (IOException e) {
                throw new CleanerException(e);
            }
        } else {
            throw new CleanerException("COBOL source was null");
        }
    }

    /**
     * Make sure this is a line worth parsing. Ignore lines too short to reach
     * the indicator area, empty lines, comments and compiler directives.
     * 
     * @param line the line to parse
     * @return true if this line should be passed on for cleaning
     */
    public boolean isLineOfCode(final String line) {
        if (line.length() < getIndicatorAreaPos() + 1) {
            return false;
        }

        /* Remove white space lines */
        if (line.trim().length() == 0) {
            return false;
        }

        /* Remove comments and special lines */
        if (isComment(line)) {
            return false;
        }

        /*
         * If there is a single token on this line, make sure it is not a
         * compiler directive. Upper casing uses a fixed locale: with the
         * default locale, "skip" would not become "SKIP" on a Turkish system
         * and the directive would be missed.
         */
        String[] tokens = line.trim().split("[\\s\\.]+");
        if (tokens.length == 1
                && COMPILER_DIRECTIVES.contains(tokens[0]
                        .toUpperCase(Locale.ENGLISH))) {
            return false;
        }

        return true;
    }

    /**
     * Remove characters that should not be passed to the lexer.
     * 
     * @param line before cleaning
     * @return a cleaner line of code, without trailing white space
     */
    public String cleanLine(final String line) {

        String cleanedLine = extendedCleanLine(line);

        /*
         * Right trim, no need to over burden the lexer with spaces. The
         * temporary leading character shields leading white space from trim()
         * so that column positions are preserved.
         */
        cleanedLine = ("a" + cleanedLine).trim().substring(1);
        return cleanedLine;
    }

    /**
     * Specialized cleaners determine if this line is a comment.
     * 
     * @param line the line to check
     * @return true if this line is a comment
     */
    public abstract boolean isComment(final String line);

    /**
     * Derived classes can extend this method to further clean a line of code.
     * This default implementation returns the line unchanged.
     * 
     * @param line the current line of code
     * @return a cleaner line of code
     */
    public String extendedCleanLine(final String line) {
        return line;
    }

    /**
     * Replace token separators such as ", " and "; " which complicate matters
     * uselessly. Replacement should not change column numbers though so we
     * simply replace each two-character separator with two white spaces.
     * 
     * @param str a string containing long separators
     * @return the same string where long separators have been replaced with
     *         white spaces
     */
    protected String replaceLongSeparators(String str) {
        return str.replace(", ", "  ").replace("; ", "  ");
    }

    /**
     * Rough triage of statements which are not strictly part of the data
     * division. Detects end of DATA DIVISION by looking for PROCEDURE DIVISION.
     * <p>
     * Since we are not guaranteed to have identification division, we initially
     * consider we are in the data division. If we find an identification
     * division, then we stop processing till we find a data division.
     * <p>
     * Each state change is reported through {@link #emitErrorMessage(String)}.
     * 
     * @param line the line to set data description status from
     * @param context the data description detection context
     * @return true if we are within the data division
     */
    public boolean isDataDivision(final String line,
            final CleaningContext context) {
        if (context.isDataDivision()) {
            Matcher matcher = IDENTIFICATION_DIVISION.matcher(line);
            if (matcher.find()) {
                context.setDataDivision(false);
                emitErrorMessage("Found identification division in ["
                        + line.trim() + "]. Lines ignored till data division.");
            }
            matcher = PROCEDURE_DIVISION.matcher(line);
            if (matcher.find()) {
                context.setDataDivision(false);
                emitErrorMessage("Found procedure division in [" + line.trim()
                        + "]. Remaining lines ignored.");
            }
        } else {
            Matcher matcher = DATA_DIVISION.matcher(line);
            if (matcher.find()) {
                context.setDataDivision(true);
                emitErrorMessage("Found data division in [" + line.trim()
                        + "]. Started looking for data items.");
            }
        }
        return context.isDataDivision();
    }

    /**
     * @return the zero-based position of the indicator area
     */
    public abstract int getIndicatorAreaPos();

    /**
     * Removes characters which are not part of a data description entry.
     * <p>
     * The fragment received as a parameter is assumed to be cleaned from
     * sequence numbers.
     * <p>
     * Data description entries start with an integer (the level) and end with a
     * period followed by either space, newline or EOF.
     * <p>
     * A single line might hold multiple data descriptions. This method is
     * recursive, and is called multiple times for each line fragment holding a
     * new data description.
     * <p>
     * Data description entries might span multiple lines which is why we need
     * to keep a context. Context tells us if we need to start by looking for a
     * level (no data description has started on some previous line), for the
     * end of an alphanumeric literal, or for a period.
     * <p>
     * Unsupported data description instructions such as COPY might appear on
     * the same line as data instructions. They also can span multiple lines.
     * This code blanks out such "non data description" statements.
     * <p>
     * Code that is outside alphanumeric literals is also cleaned from long
     * separators.
     * 
     * @param fragment a fragment of a line which might hold a data description
     * @param context the data description detection context
     * @return a line holding only data description parts or blank; a null or
     *         empty fragment is returned as is
     */
    public String removeExtraneousCharacters(final String fragment,
            final CleaningContext context) {
        if (fragment == null || fragment.length() == 0) {
            return fragment;
        }
        if (context.isLookingForLevel()) {
            return cleanUpToLevel(fragment, context);
        }
        if (context.isAlphanumStarted()) {
            return cleanInsideLiteral(fragment, context);
        }
        return cleanInsideDescription(fragment, context);
    }

    /**
     * Handles a fragment while no data description is started: blanks out
     * whatever precedes the next level and keeps the level itself.
     * 
     * @param fragment a non empty fragment of a line
     * @param context the data description detection context
     * @return the cleaned fragment (empty if no level was found)
     */
    private String cleanUpToLevel(final String fragment,
            final CleaningContext context) {
        Matcher matcher = DATA_DESCRIPTION_START.matcher(fragment);
        if (!matcher.find()) {
            if (fragment.trim().length() > 0) {
                emitErrorMessage("Extraneous characters ignored: " + fragment);
            }
            return "";
        }

        /*
         * if the level does not start on the first character, the regex
         * starts on the space or period character that precedes the level.
         */
        int start = (matcher.start() > 0) ? matcher.start() + 1 : matcher
                .start();

        /*
         * The level ends where the trailing group starts. We cannot use
         * matcher.end() - 1 because the trailing group is empty when the
         * level is the last thing on the line; that would cut the last digit
         * off and have it mistaken for a new level.
         */
        int levelEnd = matcher.start(LEVEL_TRAILER_GROUP);

        int endClean = start;

        if (start > 0) {
            /*
             * If there are non blank characters before the level, make sure
             * they are period terminated otherwise assume this is not a level
             * but more likely an argument for a keyword. In this last case, we
             * need to clean the argument as well as all the previous
             * characters.
             */
            if (isArgument(fragment.substring(0, start))) {
                endClean = levelEnd;
            }

            /* Warn that we are about to get rid of these extra characters. */
            String extraneous = fragment.substring(0, endClean).trim();
            if (extraneous.length() > 0) {
                emitErrorMessage("Extraneous characters ignored: "
                        + extraneous);
            }
        }

        /* Any extraneous character is replaced with spaces. */
        StringBuilder cleanedLine = new StringBuilder();
        for (int i = 0; i < endClean; i++) {
            cleanedLine.append(' ');
        }

        /*
         * If we actually found a level, keep it and start looking for a
         * delimiter.
         */
        if (endClean == start) {
            cleanedLine.append(fragment.substring(start, levelEnd));
            context.setLookingForLevel(false);
        }

        cleanedLine.append(removeExtraneousCharacters(
                fragment.substring(levelEnd), context));
        return cleanedLine.toString();
    }

    /**
     * Handles a fragment that begins inside an alphanumeric literal: copies
     * characters verbatim up to and including the closing delimiter.
     * 
     * @param fragment a non empty fragment of a line
     * @param context the data description detection context
     * @return the cleaned fragment
     */
    private String cleanInsideLiteral(final String fragment,
            final CleaningContext context) {
        Pattern alphanumLiteralEnd = Pattern.compile("\\"
                + context.getAlphanumDelimiter() + "($|\\s|,|;|\\"
                + COBOL_DELIMITER + ")");
        Matcher matcher = alphanumLiteralEnd.matcher(fragment);
        if (!matcher.find()) {
            /* Literal continues on the next line. */
            return fragment;
        }

        /*
         * Position right after the closing delimiter. Using start() + 1
         * rather than end() - 1 keeps the delimiter inside the literal when it
         * is the last character of the fragment (nothing follows it).
         */
        int literalEnd = matcher.start() + 1;

        StringBuilder cleanedLine = new StringBuilder();
        cleanedLine.append(fragment.substring(0, literalEnd));
        context.setAlphanumStarted(false);
        cleanedLine.append(removeExtraneousCharacters(
                fragment.substring(literalEnd), context));
        return cleanedLine.toString();
    }

    /**
     * Handles a fragment inside a data description but outside any literal:
     * whichever comes first of the end of the description or the start of an
     * alphanumeric literal determines the next state.
     * 
     * @param fragment a non empty fragment of a line
     * @param context the data description detection context
     * @return the cleaned fragment
     */
    private String cleanInsideDescription(final String fragment,
            final CleaningContext context) {
        Matcher literalStart = ALPHANUM_LITERAL_START.matcher(fragment);
        Matcher descriptionEnd = DATA_DESCRIPTION_END.matcher(fragment);
        boolean literalFound = literalStart.find();
        boolean endFound = descriptionEnd.find();

        StringBuilder cleanedLine = new StringBuilder();
        if (endFound
                && (!literalFound || descriptionEnd.end() < literalStart
                        .end())) {
            /* The description ends before any literal starts. */
            int end = descriptionEnd.end();
            cleanedLine.append(replaceLongSeparators(fragment.substring(0,
                    end)));
            context.setLookingForLevel(true);
            cleanedLine.append(removeExtraneousCharacters(
                    fragment.substring(end), context));
        } else if (literalFound) {
            /* A literal starts; remember which delimiter opened it. */
            int afterDelimiter = literalStart.end();
            cleanedLine.append(replaceLongSeparators(fragment.substring(0,
                    afterDelimiter)));
            context.setAlphanumDelimiter(fragment.charAt(afterDelimiter - 1));
            context.setAlphanumStarted(true);
            cleanedLine.append(removeExtraneousCharacters(
                    fragment.substring(afterDelimiter), context));
        } else {
            /* The description continues on the next line. */
            cleanedLine.append(fragment);
        }
        return cleanedLine.toString();
    }

    /**
     * Describes the cleaning context. Because data description sentences can be
     * multiline or because it does not make sense to look for data description
     * entries once we past a PROCEDURE DIVISION section, we need to keep track
     * of the context.
     * 
     */
    public static class CleaningContext {

        /**
         * True when we are looking for a level (start of a data description
         * entry).
         */
        private boolean _lookingForLevel = true;

        /** True if we are likely to be in a COBOL DATA DIVISION section. */
        private boolean _inDataDivision = true;

        /**
         * Will be true when an alphanumeric delimiter is found and not yet
         * closed.
         */
        private boolean _alphanumStarted;

        /** When an alphanumeric is started this is the delimiter character. */
        private char _alphanumDelimiter;

        /**
         * @return true when we are looking for a level
         */
        public boolean isLookingForLevel() {
            return _lookingForLevel;
        }

        /**
         * @param isLookingForLevel set to true when we are looking for a level
         *            (start of a data description entry)
         */
        public void setLookingForLevel(final boolean isLookingForLevel) {
            _lookingForLevel = isLookingForLevel;
        }

        /**
         * @return true if we are likely to be in a COBOL DATA DIVISION section
         */
        public boolean isDataDivision() {
            return _inDataDivision;
        }

        /**
         * @param dataDivision set to true if we are likely to be in a COBOL
         *            DATA DIVISION section
         */
        public void setDataDivision(final boolean dataDivision) {
            _inDataDivision = dataDivision;
        }

        /**
         * @return true when an alphanumeric delimiter is found and not yet
         *         closed
         */
        public boolean isAlphanumStarted() {
            return _alphanumStarted;
        }

        /**
         * @param alphanumStarted true when an alphanumeric delimiter is found
         *            and not yet closed
         */
        public void setAlphanumStarted(final boolean alphanumStarted) {
            _alphanumStarted = alphanumStarted;
        }

        /**
         * @return when an alphanumeric is started this is the delimiter
         *         character
         */
        public char getAlphanumDelimiter() {
            return _alphanumDelimiter;
        }

        /**
         * @param alphanumDelimiter when an alphanumeric is started this is the
         *            delimiter character
         */
        public void setAlphanumDelimiter(final char alphanumDelimiter) {
            _alphanumDelimiter = alphanumDelimiter;
        }

    }

    /**
     * Examine characters before an assumed level. If these characters are not
     * terminated by a COBOL delimiter then the level is actually an argument to
     * a previous keyword, not an actual level.
     * 
     * @param fragment a fragment of code preceding an assumed level
     * @return true if the assumed level is an argument; false when the
     *         fragment is blank or ends with the COBOL delimiter
     */
    protected boolean isArgument(final String fragment) {
        String s = fragment.trim();
        if (s.length() > 0) {
            return s.charAt(s.length() - 1) != COBOL_DELIMITER;
        }
        return false;
    }

    /**
     * Add an error message to the history.
     * 
     * @param msg the error message
     */
    public void emitErrorMessage(final String msg) {
        getErrorHandler().addMessageToHistory(msg);
    }

    /**
     * @return the error messages handler
     */
    public RecognizerErrorHandler getErrorHandler() {
        return _errorHandler;
    }
}