// com.legstar.cobol.AbstractCobolSourceCleaner Maven / Gradle / Ivy
/*******************************************************************************
* Copyright (c) 2010 LegSem.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the GNU Lesser Public License v2.1
* which accompanies this distribution, and is available at
* http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
*
* Contributors:
* LegSem - initial API and implementation
******************************************************************************/
package com.legstar.cobol;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.legstar.antlr.CleanerException;
/**
* In order to reduce the lexer/parser grammar complexity, this class will
* remove all unnecessary characters from the original source. This way, the
* ANTLR lexer will be presented with a purified source that only contains data
* division entries.
*
* This allows users to submit complete COBOL programs or fragments of COBOL
* programs with non data description statements to the parser without the need
* to add grammar rules for all these cases.
*
*/
public abstract class AbstractCobolSourceCleaner {
/**
* Cobol sentences are assumed to be terminated by this character. It is
* spliced (escaped with a backslash) into the regular expressions below.
*/
public static final char COBOL_DELIMITER = '.';
/**
* Pattern that recognizes the start of a data description entry: one or two
* digits that are preceded by the start of input, a whitespace or the COBOL
* delimiter, and followed by a whitespace, the COBOL delimiter or the end of
* input. The preceding and following characters are part of the match.
*/
public static final Pattern DATA_DESCRIPTION_START = Pattern
.compile("(^|\\s|\\" + COBOL_DELIMITER + ")\\d(\\d)?(\\s|\\"
+ COBOL_DELIMITER + "|$)");
/**
* Pattern that recognizes the end of a data description entry: the COBOL
* delimiter either at the end of input or immediately followed by a
* whitespace (the whitespace is part of the match).
*/
public static final Pattern DATA_DESCRIPTION_END = Pattern.compile("(\\"
+ COBOL_DELIMITER + "$)|(\\" + COBOL_DELIMITER + "\\s)");
/**
* Pattern that recognizes the start of an alphanumeric literal: a quote or
* an apostrophe located at the start of input or right after a whitespace.
*/
public static final Pattern ALPHANUM_LITERAL_START = Pattern
.compile("(^|\\s)[\\\"\']");
/**
* Pattern that recognizes the start of a procedure division: optional
* leading whitespace followed by the two keywords separated by a single
* space, in any letter case.
*/
public static final Pattern PROCEDURE_DIVISION = Pattern.compile(
"^(\\s)*PROCEDURE DIVISION", Pattern.CASE_INSENSITIVE);
/**
* Pattern that recognizes the start of an identification division: optional
* leading whitespace followed by either the abbreviated (ID) or the full
* keyword, a single space and DIVISION, in any letter case.
*/
public static final Pattern IDENTIFICATION_DIVISION = Pattern.compile(
"^(\\s)*ID(ENTIFICATION)? DIVISION", Pattern.CASE_INSENSITIVE);
/**
* Pattern that recognizes the start of a data division: optional leading
* whitespace followed by the two keywords separated by a single space, in
* any letter case.
*/
public static final Pattern DATA_DIVISION = Pattern.compile(
"^(\\s)*DATA DIVISION", Pattern.CASE_INSENSITIVE);
/**
* List of compiler directives (they can be period delimited but are
* guaranteed to be alone on a line). Entries are upper case; lookups are
* expected to upper-case the candidate token first.
*/
public static final List < String > COMPILER_DIRECTIVES = Arrays.asList(
"EJECT", "SKIP", "SKIP1", "SKIP2", "SKIP3");
/** Handles error messages; assigned once by the constructor. */
private final RecognizerErrorHandler _errorHandler;

/**
 * Construct with a shared error handler.
 *
 * The handler is stored as is (no copy, no null check) and receives every
 * message emitted while cleaning.
 *
 * @param errorHandler handles error messages
 */
public AbstractCobolSourceCleaner(final RecognizerErrorHandler errorHandler) {
    _errorHandler = errorHandler;
}
/**
 * Turns a raw COBOL source, which may hold sequence numbers or statements
 * that are not data descriptions, into a source holding only data
 * description entries.
 *
 * Every input line yields exactly one output line so that line numbers are
 * preserved: lines that are rejected are emitted as empty lines.
 *
 * @param cobolSource the raw COBOL source
 * @return the source cleaned up
 * @throws CleanerException if the source is null, cannot be read, or the
 *             cleaned result is no longer than a single line break
 */
public String clean(final String cobolSource) throws CleanerException {
    if (cobolSource == null) {
        throw new CleanerException("COBOL source was null");
    }

    final StringBuilder output = new StringBuilder();
    final CleaningContext cleaningContext = new CleaningContext();
    final BufferedReader lines = new BufferedReader(new StringReader(
            cobolSource));
    try {
        for (String current = lines.readLine(); current != null;
                current = lines.readLine()) {
            /* Only code lines located in the data division are kept. */
            if (isLineOfCode(current)
                    && isDataDivision(current, cleaningContext)) {
                final String purified = cleanLine(current);
                output.append(removeExtraneousCharacters(purified,
                        cleaningContext));
            }
            /* One line out for each line in, even when it was rejected. */
            output.append("\n");
        }
        if (output.length() <= "\n".length()) {
            throw new CleanerException(
                    "No data descriptions found. Are you sure this is COBOL source?");
        }
        return output.toString();
    } catch (IOException e) {
        throw new CleanerException(e);
    }
}
/**
 * Make sure this is a line worth parsing. Ignore lines that are too short
 * to reach past the indicator area, blank lines, comments and lines that
 * hold nothing but a compiler directive.
 *
 * @param line the line to parse
 * @return true if this is not an empty, comment or compiler directive line
 */
public boolean isLineOfCode(final String line) {
    /* Too short to hold anything beyond the indicator area. */
    if (line.length() < getIndicatorAreaPos() + 1) {
        return false;
    }
    /* Remove white space lines */
    final String trimmed = line.trim();
    if (trimmed.length() == 0) {
        return false;
    }
    /* Remove comments and special lines */
    if (isComment(line)) {
        return false;
    }
    /*
     * If there is a single token on this line, make sure it is not a
     * compiler directive. Upper-casing is done with Locale.ROOT so that the
     * comparison with the upper case directive names does not depend on the
     * platform locale (a Turkish default locale, for instance, would turn
     * "skip" into a string that is not "SKIP").
     */
    final String[] tokens = trimmed.split("[\\s\\.]+");
    if (tokens.length == 1
            && COMPILER_DIRECTIVES.contains(tokens[0]
                    .toUpperCase(Locale.ROOT))) {
        return false;
    }
    return true;
}
/**
 * Strips what the lexer should not see from a line: applies the
 * {@link #extendedCleanLine(String)} hook, then drops trailing white space
 * while leaving leading characters (and therefore column positions) alone.
 *
 * @param line before cleaning
 * @return a cleaner line of code
 */
public String cleanLine(final String line) {
    /*
     * String.valueOf keeps the outcome of a plain string concatenation for
     * a null hook result.
     */
    final String text = String.valueOf(extendedCleanLine(line));
    /* Right trim only: everything up to and including a space is dropped. */
    int end = text.length();
    while (end > 0 && text.charAt(end - 1) <= ' ') {
        end--;
    }
    return text.substring(0, end);
}
/**
* Specialized cleaners determine if this line is a comment. No default is
* provided here. {@link #isLineOfCode(String)} rejects every line for which
* this method returns true, and only calls it for lines that are long enough
* and not blank.
*
* @param line the line to check
* @return true if this line is a comment
*/
public abstract boolean isComment(final String line);
/**
* Derived classes can extend this method to further clean a line of code.
* It is invoked by {@link #cleanLine(String)} before trailing white space is
* removed. This default implementation returns the line unchanged.
*
* @param line the current line of code
* @return a cleaner line of code
*/
public String extendedCleanLine(final String line) {
return line;
}
/**
 * Replace token separators such as ", " and "; " which complicate matters
 * uselessly. Replacement must not change column numbers, so each
 * two-character separator is replaced with exactly two white spaces.
 *
 * @param str a string containing long separators
 * @return a string of the same length where long separators have been
 *         replaced with white spaces
 */
protected String replaceLongSeparators(String str) {
    /* Same width in, same width out: later columns must not shift. */
    return str.replace(", ", "  ").replace("; ", "  ");
}
/**
 * Rough triage of statements which are not strictly part of the data
 * division, driven by division headers found on the line.
 *
 * The context starts out assuming we are in the data division since an
 * identification division is not guaranteed to be there. While in the data
 * division, an identification division or a procedure division header
 * switches the flag off. While outside, a data division header switches it
 * back on. Each switch is reported through {@link #emitErrorMessage(String)}.
 *
 * @param line the line to set data description status from
 * @param context the data description detection context
 * @return true if we are within the data division once the line has been
 *         examined
 */
public boolean isDataDivision(final String line,
        final CleaningContext context) {
    if (!context.isDataDivision()) {
        /* Outside the data division: only a data division header matters. */
        if (DATA_DIVISION.matcher(line).find()) {
            context.setDataDivision(true);
            emitErrorMessage("Found data division in [" + line.trim()
                    + "]. Started looking for data items.");
        }
        return context.isDataDivision();
    }

    /* Inside the data division: both headers are checked, in this order. */
    if (IDENTIFICATION_DIVISION.matcher(line).find()) {
        context.setDataDivision(false);
        emitErrorMessage("Found identification division in ["
                + line.trim() + "]. Lines ignored till data division.");
    }
    if (PROCEDURE_DIVISION.matcher(line).find()) {
        context.setDataDivision(false);
        emitErrorMessage("Found procedure division in [" + line.trim()
                + "]. Remaining lines ignored.");
    }
    return context.isDataDivision();
}
/**
* Supplied by specialized cleaners. {@link #isLineOfCode(String)} rejects
* any line whose length is smaller than this position plus one.
*
* @return the zero-based position of the indicator area
*/
public abstract int getIndicatorAreaPos();
/**
 * Removes characters which are not part of a data description entry.
 *
 * The fragment received as a parameter is assumed to be cleaned from
 * sequence numbers.
 *
 * Data description entries start with an integer (the level) and end with a
 * period followed by either space, newline or EOF.
 *
 * A single line might hold multiple data descriptions. This method is
 * recursive, and is called multiple times for each line fragment holding a
 * new data description.
 *
 * Data description entries might span multiple lines which is why we need
 * to keep a context. The context tells us which of three states applies:
 * looking for a level (no data description is in progress), inside an
 * alphanumeric literal (looking for its closing delimiter), or inside a
 * data description (looking for either a literal start or the closing
 * period).
 *
 * Unsupported data description instructions such as COPY might appear on
 * the same line as data instructions. They also can span multiple lines.
 * This code blanks out such "non data description" statements.
 *
 * Code that is outside alphanumeric literals is also cleaned from long
 * separators.
 *
 * @param fragment a fragment of a line which might hold a data description
 * @param context the data description detection context, updated as the
 *            fragment is consumed
 * @return a line holding only data description parts or blank; a null or
 *         empty fragment is returned as is
 */
public String removeExtraneousCharacters(final String fragment,
        final CleaningContext context) {
    if (fragment == null || fragment.length() == 0) {
        return fragment;
    }
    StringBuilder cleanedLine = new StringBuilder();

    if (context.isLookingForLevel()) {
        Matcher matcher = DATA_DESCRIPTION_START.matcher(fragment);
        if (matcher.find()) {
            /*
             * if the level does not start on the first character, the regex
             * starts on the space or period character that precedes the
             * level.
             */
            int start = (matcher.start() > 0) ? matcher.start() + 1
                    : matcher.start();
            /*
             * The match normally ends with the delimiter that follows the
             * level, which must be left for the next pass. When the level
             * is the last thing on the line there is no such delimiter and
             * the match ends right after the last digit: stepping back one
             * character in that case would cut the level (or a numeric
             * argument) in two.
             */
            int levelEnd = matcher.end();
            if (!Character.isDigit(fragment.charAt(levelEnd - 1))) {
                levelEnd--;
            }
            int endClean = start;
            if (start > 0) {
                /*
                 * If there are non blank characters before the level, make
                 * sure they are period terminated otherwise assume this is
                 * not a level but more likely an argument for a keyword. In
                 * this last case, we need to clean the argument as well as
                 * all the previous characters.
                 */
                if (isArgument(fragment.substring(0, start))) {
                    endClean = levelEnd;
                }
                /*
                 * Warn that we are about to get rid of these extra
                 * characters.
                 */
                String extraneous = fragment.substring(0, endClean).trim();
                if (extraneous.length() > 0) {
                    emitErrorMessage("Extraneous characters ignored: "
                            + extraneous);
                }
            }
            /* Any extraneous character is replaced with spaces. */
            for (int i = 0; i < endClean; i++) {
                cleanedLine.append(' ');
            }
            /*
             * If we actually found a level, keep it and start looking for a
             * delimiter.
             */
            if (endClean == start) {
                cleanedLine.append(fragment.substring(start, levelEnd));
                context.setLookingForLevel(false);
            }
            /* Whatever follows is processed with the updated context. */
            cleanedLine.append(removeExtraneousCharacters(
                    fragment.substring(levelEnd), context));
        } else {
            /* No level at all: the whole fragment is dropped. */
            if (fragment.trim().length() > 0) {
                emitErrorMessage("Extraneous characters ignored: "
                        + fragment);
            }
        }
    } else if (context.isAlphanumStarted()) {
        /*
         * Inside a literal: it ends on its own delimiter when that
         * delimiter is followed by end of input, a white space, a comma, a
         * semicolon or the COBOL delimiter.
         */
        Pattern alphanumLiteralEnd = Pattern.compile("\\"
                + context.getAlphanumDelimiter() + "($|\\s|,|;|\\"
                + COBOL_DELIMITER + ")");
        Matcher alphanumLiteralMatcher = alphanumLiteralEnd
                .matcher(fragment);
        if (alphanumLiteralMatcher.find()) {
            /*
             * Keep everything up to and including the closing delimiter.
             * The position is derived from the start of the match: when the
             * delimiter is the last character of the fragment there is no
             * trailing character in the match, and leaving the delimiter to
             * the next pass would make it look like a new literal start.
             */
            int literalEnd = alphanumLiteralMatcher.start() + 1;
            cleanedLine.append(fragment.substring(0, literalEnd));
            context.setAlphanumStarted(false);
            cleanedLine.append(removeExtraneousCharacters(
                    fragment.substring(literalEnd), context));
        } else {
            /* Literal continues past this fragment: keep it untouched. */
            cleanedLine.append(fragment);
        }
    } else {
        /*
         * Inside a data description: whichever comes first of a literal
         * start and the closing period decides what happens next.
         */
        Matcher alphanumLiteralMatcher = ALPHANUM_LITERAL_START
                .matcher(fragment);
        Matcher matcher = DATA_DESCRIPTION_END.matcher(fragment);
        boolean literalFound = alphanumLiteralMatcher.find();
        boolean endFound = matcher.find();
        if (endFound
                && (!literalFound
                        || matcher.end() < alphanumLiteralMatcher.end())) {
            /* The entry ends here: go back to looking for a level. */
            cleanedLine.append(replaceLongSeparators(fragment.substring(0,
                    matcher.end())));
            context.setLookingForLevel(true);
            cleanedLine.append(removeExtraneousCharacters(
                    fragment.substring(matcher.end()), context));
        } else if (literalFound) {
            /*
             * A literal starts first: remember its delimiter (the last
             * character matched) and switch to literal mode.
             */
            cleanedLine.append(replaceLongSeparators(fragment.substring(0,
                    alphanumLiteralMatcher.end())));
            context.setAlphanumDelimiter(fragment.substring(
                    alphanumLiteralMatcher.end() - 1).charAt(0));
            context.setAlphanumStarted(true);
            cleanedLine.append(removeExtraneousCharacters(
                    fragment.substring(alphanumLiteralMatcher.end()),
                    context));
        } else {
            /* Neither: the entry continues on a following line. */
            cleanedLine.append(fragment);
        }
    }
    return cleanedLine.toString();
}
/**
 * State carried from one line (or line fragment) to the next while
 * cleaning. Data description sentences can span several lines and data
 * descriptions are only looked for within the data division, so the cleaner
 * needs to remember where it stands.
 *
 * A new context is looking for a level, is considered inside the data
 * division and has no alphanumeric literal in progress.
 */
public static class CleaningContext {

    /** True while the next thing expected is a level number. */
    private boolean _awaitingLevel = true;

    /** True while lines are considered part of a COBOL DATA DIVISION. */
    private boolean _withinDataDivision = true;

    /** True while an alphanumeric literal has been opened and not closed. */
    private boolean _literalOpen;

    /** Delimiter character of the alphanumeric literal in progress. */
    private char _literalDelimiter;

    /**
     * @return true when we are looking for a level
     */
    public boolean isLookingForLevel() {
        return this._awaitingLevel;
    }

    /**
     * @param isLookingForLevel set to true when we are looking for a level
     *            (start of a data description entry)
     */
    public void setLookingForLevel(final boolean isLookingForLevel) {
        this._awaitingLevel = isLookingForLevel;
    }

    /**
     * @return true if we are likely to be in a COBOL DATA DIVISION section
     */
    public boolean isDataDivision() {
        return this._withinDataDivision;
    }

    /**
     * @param dataDivision set to true if we are likely to be in a COBOL
     *            DATA DIVISION section
     */
    public void setDataDivision(final boolean dataDivision) {
        this._withinDataDivision = dataDivision;
    }

    /**
     * @return true when an alphanumeric delimiter is found and not yet
     *         closed
     */
    public boolean isAlphanumStarted() {
        return this._literalOpen;
    }

    /**
     * @param alphanumStarted true when an alphanumeric delimiter is found
     *            and not yet closed
     */
    public void setAlphanumStarted(boolean alphanumStarted) {
        this._literalOpen = alphanumStarted;
    }

    /**
     * @return when an alphanumeric is started this is the delimiter
     *         character
     */
    public char getAlphanumDelimiter() {
        return this._literalDelimiter;
    }

    /**
     * @param alphanumDelimiter when an alphanumeric is started this is the
     *            delimiter character
     */
    public void setAlphanumDelimiter(char alphanumDelimiter) {
        this._literalDelimiter = alphanumDelimiter;
    }
}
/**
 * Looks at the characters that precede an assumed level. Unless they end
 * with the COBOL delimiter (ignoring surrounding white space), the number
 * that follows is an argument to a previous keyword rather than a level.
 * Nothing but white space in front means it is not an argument.
 *
 * @param fragment a fragment of code preceding an assumed level
 * @return true if the assumed level is an argument
 */
protected boolean isArgument(final String fragment) {
    final String preceding = fragment.trim();
    final int last = preceding.length() - 1;
    return last >= 0 && preceding.charAt(last) != COBOL_DELIMITER;
}
/**
 * Hands a message over to the error handler, which adds it to its history.
 * The handler is obtained through {@link #getErrorHandler()}.
 *
 * @param msg the error message
 */
public void emitErrorMessage(final String msg) {
    final RecognizerErrorHandler handler = getErrorHandler();
    handler.addMessageToHistory(msg);
}
/**
 * Gives access to the handler received at construction time.
 *
 * @return the error messages handler
 */
public RecognizerErrorHandler getErrorHandler() {
    return this._errorHandler;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy