All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.okapi.steps.cleanup.Cleaner Maven / Gradle / Ivy

/*===========================================================================
  Copyright (C) 2013 by the Okapi Framework contributors
-----------------------------------------------------------------------------
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
===========================================================================*/

package net.sf.okapi.steps.cleanup;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.resource.ISegments;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextFragment;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Cleaner {

    private final Logger LOGGER = LoggerFactory.getLogger(getClass());
    // TODO: organize these strings better. Dynamically set on instantiation
    // thru private method?
    private static String OPENING_QUOTES_W_SPACE = "([\u00AB\u2039])([\\s\u00A0]+)";
    private static String CLOSING_QUOTES_W_SPACE = "([\\s\u00A0]+)([\u00BB\u203A])";
    private static String DOUBLE_QUOTES = "\u201C|\u201D|\u201E|\u201F|\u00AB|\u00BB";
    private static String DQ_REPLACE = "\"";
    private static String SINGLE_QUOTES = "\u2018|\u2019|\u201A|\u2039|\u203A";
    private static String SQ_REPLACE = "\u0027";
    private static String SPECIALPUNC = "\"\u0027";
    private static final String SINGLEQUOTES = "\'\u2018\u2019\u201A\u201B\u2039\u203A";
    private static final String DOUBLEQUOTES = "\"\u201C\u201D\u201E\u201F\u00AB\u00BB";
    private static final String PUNCTUATION = ".,;:!\u00A1?\u00BF";
    private static final String OPENINGQUOTES = "\u2018\u201A\u2039\u201C\u201E\u00AB";
    private static final String CLOSINGQUOTES = "\u2019\u201B\u203A\u201D\u201F\u00BB";
    private static final String MARKS = "\'\u2018\u2019\u201A\u201B\u2039\u203A\"\u201C\u201D\u201E\u201F\u00AB\u00BB.,;:!\u00A1?\u00BF";
    private static final String QUOTES = "\'\u2018\u2019\u201A\u201B\u2039\u203A\"\u201C\u201D\u201E\u201F\u00AB\u00BB";
    private Parameters params;

    /**
     * Creates a Cleaner object with default options.
     */
    public Cleaner() {

        this(null);
    }

    /**
     * Creates a Cleaner object with a given set of options.
     *
     * @param params the options to assign to this object (use null for the
     * defaults).
     */
    public Cleaner(Parameters params) {

        this.params = (params == null ? new Parameters() : params);
    }

    /**
     * Performs the cleaning of the text unit according to user selected
     * options.
     *
     * @param tu the unit containing the text to clean
     * @param targetLocale
     * @return true if tu should be discarded and false otherwise
     */
    public boolean run(ITextUnit tu, LocaleId targetLocale) {

        if (!tu.isEmpty()) {
            ISegments srcSegs = tu.getSourceSegments();
            for (Segment srcSeg : srcSegs) {
                Segment trgSeg = tu.getTargetSegment(targetLocale, srcSeg.getId(), false);

                // TODO: test trgSeg == null here else continue;
                if (trgSeg == null) {
                    continue;
                }

                // normalized whitespace.
                // all subsequent steps assume only single spaces
                normalizeWhitespace(tu, srcSeg, targetLocale);

                // clean with selected options
                if (params.getNormalizeQuotes()) {
                    normalizeQuotation(tu, srcSeg, targetLocale);
                }
                if (params.getCheckCharacters()) {
                    checkCharacters(tu, srcSeg, targetLocale);

                }
                if (params.getMatchRegexExpressions()) {
                    matchRegexExpressions(tu, srcSeg, targetLocale);
                }

            }
        }

        // return false iff tu has text, else remove tu
        if (pruneTextUnit(tu, targetLocale) == true) {
            return true;
        } else {
            return false;
        }
    }

    /**
     * Converts whitespace ({tab}, {space}, {CR}, {LF}) to single space.
     *
     * @param tu: the TextUnit containing the segments to update
     * @param seg: the Segment to update
     * @param targetLocale: the language for which the text should be updated
     */
    protected void normalizeWhitespace(ITextUnit tu, Segment seg, LocaleId targetLocale) {

        TextFragment.unwrap(seg.getContent());
        TextFragment.unwrap(tu.getTargetSegment(targetLocale, seg.getId(), false).getContent());
    }

    /**
     * Converts all quotation marks (curly or language specific) to straight
     * quotes. All apostrophes will also be converted to their straight
     * equivalents.
     *
     * @param srcFrag: original text to be normalized
     * @param trgFrag: target text to be normalized
     * @param targetLocale the language for which the text should be updated
     */
    protected void normalizeQuotation(ITextUnit tu, Segment seg, LocaleId targetLocale) {

        TextFragment trgFragment = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent();
        String srcText = seg.getContent().getCodedText();
        String trgText = trgFragment.getCodedText();

        Pattern pattern;
        Matcher matcher;

        // update quote spacing
        pattern = Pattern.compile(OPENING_QUOTES_W_SPACE);
        matcher = pattern.matcher(trgText);
        trgText = matcher.replaceAll("$1");
        pattern = Pattern.compile(CLOSING_QUOTES_W_SPACE);
        matcher = pattern.matcher(trgText);
        trgText = matcher.replaceAll("$2");

        // update source quotes
        pattern = Pattern.compile(DOUBLE_QUOTES);
        matcher = pattern.matcher(srcText);
        srcText = matcher.replaceAll(DQ_REPLACE).toString();
        pattern = Pattern.compile(SINGLE_QUOTES);
        matcher = pattern.matcher(srcText);
        srcText = matcher.replaceAll(SQ_REPLACE).toString();

        // update target quotes
        pattern = Pattern.compile(DOUBLE_QUOTES);
        matcher = pattern.matcher(trgText);
        trgText = matcher.replaceAll(DQ_REPLACE).toString();
        pattern = Pattern.compile(SINGLE_QUOTES);
        matcher = pattern.matcher(trgText);
        trgText = matcher.replaceAll(SQ_REPLACE).toString();

        // save updated strings
        seg.getContent().setCodedText(srcText);
        trgFragment.setCodedText(trgText);
    }

    protected void normalizeMarks(ITextUnit tu, Segment seg, LocaleId targetLocale) {
//
////		TextFragment trgFrag = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent();
//        StringBuilder srcText = new StringBuilder(seg.text);
////		StringBuilder trgText = new StringBuilder(trgFrag.getCodedText());
//
//        int cursor = 0;
//        int proxyCheck = 0;
//        int proximityThreshold = 2;
//        int ch = 0;
//        int quoteChBefore = 0;
//        int quoteChAfter = 0;
//        int puncCh = 0;
//        int nearQuote = -1;
//        int nearPunc = -1;
//
//        boolean doQuotes = params.getNormalizeQuotes();
//        boolean isPunctuation = false;
//        boolean isQuote = false;
//        boolean isNearQuote = false;
//        boolean isBeforeQuote = false;
//        boolean isAfterQuote = false;
//        boolean isNearPuncuation = false;
//
//        // normalize source
//        while (cursor <= srcText.length() - 1) {
//            ch = srcText.charAt(cursor);
//            char c = (char) ch;
//            if (MARKS.indexOf(ch) != -1) {
//                // Punctuation
//                if (PUNCTUATION.indexOf(ch) != -1) {
//                    if (nearQuote >= -1) {
//                        // check proximity backwards
//                        proxyCheck = cursor;
//                        while ((proxyCheck > 0) && ((cursor - proxyCheck) < proximityThreshold) && (isAfterQuote == false)) {
//                            proxyCheck -= 1;
//                            if (QUOTES.indexOf(srcText.charAt(proxyCheck)) != -1) {
//                                nearQuote = proxyCheck;
//                                isNearQuote = true;
//                                isAfterQuote = true;
//                                quoteChAfter = srcText.charAt(proxyCheck);
//                                break;
//                            }
//                        }
//                        // check proximity forwards
//                        proxyCheck = cursor;
//                        while ((proxyCheck < srcText.length() - 1) && ((proxyCheck - cursor) < proximityThreshold) && (isBeforeQuote == false)) {
//                            proxyCheck += 1;
//                            if (QUOTES.indexOf(srcText.charAt(proxyCheck)) != -1) {
//                                nearQuote = proxyCheck;
//                                isNearQuote = true;
//                                isBeforeQuote = true;
//                                quoteChBefore = srcText.charAt(proxyCheck);
//                                break;
//                            }
//                        }
//                        // TODO: quotes are both back and forwards; "a " for example.
//
//                        // TODO: process PUNCTUATION based on rules and known attributes
//                        if (isNearQuote == false) {
//                            // treat PUNCTUATION according to rules
//                            switch (ch) {
//                                case '\u002E': // period "."
//                                    // characters after period
//                                    if (cursor < srcText.length() - 1) {
//                                        if ((Character.isWhitespace(srcText.charAt(cursor + 1))) || (srcText.charAt(cursor + 1) == '\u00A0')) {
//                                            srcText.deleteCharAt(cursor + 1);
//                                        }
//                                        // end of abbreviation
//                                        if ((Pattern.compile("([a-zA-Z]{1,3}\\.[\\s ]?)+").matcher(srcText.subSequence(cursor - 4, cursor - 1)).find() == true) && (Character.isLowerCase(srcText.charAt(cursor + 1)))) {
//                                            srcText.insert(cursor + 1, ' ');
//                                        }
//                                        // end of a sentence
//                                        if (Character.isUpperCase(srcText.charAt(cursor + 1))) {
//                                            srcText.insert(cursor + 1, ' ');
//                                        }
//                                    }
//                                    // characters before period
//                                    if (cursor > 0) {
//                                        if ((Character.isWhitespace(srcText.charAt(cursor - 1))) || (srcText.charAt(cursor - 1) == '\u00A0')) {
//                                            // end of a sentence
//                                            if (cursor == srcText.length() - 1) {
//                                                srcText.deleteCharAt(cursor - 1);
//                                                cursor -= 1;
//                                            } else {
//                                                // period is a decimal marker
//                                                if ((cursor < srcText.length() - 1) && (!Character.isDigit(srcText.charAt(cursor - 1))) && (Character.isDigit(srcText.charAt(cursor + 1)))) {
//                                                    srcText.deleteCharAt(cursor - 1);
//                                                    cursor -= 1;
//                                                }
//                                            }
//                                        }
//                                    }
//                                    break;
//                                case '\u002C': // comma ","
//                                    // characters after comma
//                                    if (cursor < srcText.length() - 1) {
//                                        if ((Character.isWhitespace(srcText.charAt(cursor + 1))) || (srcText.charAt(cursor + 1) == '\u00A0')) {
//                                            srcText.deleteCharAt(cursor + 1);
//                                        }
//                                        // normal instance
//                                        if (Character.isLetterOrDigit(srcText.charAt(cursor + 1))) {
//                                            srcText.insert(cursor + 1, ' ');
//                                        }
//                                    }
//                                    // characters before comma
//                                    if (cursor > 0) {
//                                        if ((Character.isWhitespace(srcText.charAt(cursor - 1))) || (srcText.charAt(cursor - 1) == '\u00A0')) {
//                                            // comma is a decimal marker
//                                            if ((cursor < srcText.length() - 1) && (!Character.isDigit(srcText.charAt(cursor - 1))) && (Character.isDigit(srcText.charAt(cursor + 1)))) {
//                                                srcText.deleteCharAt(cursor - 1);
//                                                cursor -= 1;
//                                            }
//                                        }
//                                    }
//                                    break;
//                                case '\u003B': // semicolon ";"
//                                    // characters after colon
//                                    if (cursor < srcText.length() - 1) {
//                                        if ((Character.isWhitespace(srcText.charAt(cursor + 1))) || (srcText.charAt(cursor + 1) == '\u00A0')) {
//                                            srcText.deleteCharAt(cursor + 1);
//                                        }
//                                        // normal instance
//                                        if (Character.isLetterOrDigit(srcText.charAt(cursor + 1))) {
//                                            srcText.insert(cursor + 1, ' ');
//                                        }
//                                    }
//                                    // characters before colon
//                                    if (cursor > 0) {
//                                        if ((Character.isWhitespace(srcText.charAt(cursor - 1))) || (srcText.charAt(cursor - 1) == '\u00A0')) {
//                                            srcText.deleteCharAt(cursor - 1);
//                                            cursor -= 1;
//                                        }
//                                    }
//                                    break;
//                                case '\u003A': // colon ":"
//                                    // characters after colon
//                                    if (cursor < srcText.length() - 1) {
//                                        if ((Character.isWhitespace(srcText.charAt(cursor + 1))) || (srcText.charAt(cursor + 1) == '\u00A0')) {
//                                            srcText.deleteCharAt(cursor + 1);
//                                        }
//                                        // normal instance
//                                        if (Character.isLetterOrDigit(srcText.charAt(cursor + 1))) {
//                                            srcText.insert(cursor + 1, ' ');
//                                        }
//                                    }
//                                    // characters before colon
//                                    if (cursor > 0) {
//                                        if ((Character.isWhitespace(srcText.charAt(cursor - 1))) || (srcText.charAt(cursor - 1) == '\u00A0')) {
//                                            srcText.deleteCharAt(cursor - 1);
//                                            cursor -= 1;
//                                        }
//                                    }
//                                    break;
//                                case '\u0021': // exclamation mark "!"
//                                    // characters after colon
//                                    if (cursor < srcText.length() - 1) {
//                                        if ((Character.isWhitespace(srcText.charAt(cursor + 1))) || (srcText.charAt(cursor + 1) == '\u00A0')) {
//                                            srcText.deleteCharAt(cursor + 1);
//                                        }
//                                        // normal instance
//                                        if (Character.isLetterOrDigit(srcText.charAt(cursor + 1))) {
//                                            srcText.insert(cursor + 1, ' ');
//                                        }
//                                    }
//                                    // characters before colon
//                                    if (cursor > 0) {
//                                        if ((Character.isWhitespace(srcText.charAt(cursor - 1))) || (srcText.charAt(cursor - 1) == '\u00A0')) {
//                                            srcText.deleteCharAt(cursor - 1);
//                                            cursor -= 1;
//                                        }
//                                    }
//                                    break;
//                                case '\u00A1': // inverted exclamation mark "¡"
//                                    break;
//                                case '\u003F': // question mark "?"
//                                    // characters after colon
//                                    if (cursor < srcText.length() - 1) {
//                                        if ((Character.isWhitespace(srcText.charAt(cursor + 1))) || (srcText.charAt(cursor + 1) == '\u00A0')) {
//                                            srcText.deleteCharAt(cursor + 1);
//                                        }
//                                        // normal instance
//                                        if (Character.isLetterOrDigit(srcText.charAt(cursor + 1))) {
//                                            srcText.insert(cursor + 1, ' ');
//                                        }
//                                    }
//                                    // characters before colon
//                                    if (cursor > 0) {
//                                        if ((Character.isWhitespace(srcText.charAt(cursor - 1))) || (srcText.charAt(cursor - 1) == '\u00A0')) {
//                                            srcText.deleteCharAt(cursor - 1);
//                                            cursor -= 1;
//                                        }
//                                    }
//                                    break;
//                                case '\u00BF': // inverted question mark "¿"
//                                    break;
//                                default:
//                                    // other punctuation marks are retained as-is
//                                    break;
//                            }
//                        } else {
//                            // TODO: exceptions to the rules
//                            switch (ch) {
//                                case '\u002E': // period "."
//                                    // ch is after quote
//                                    if (isAfterQuote == true) {
//                                        if (OPENINGQUOTES.indexOf(quoteChAfter) != -1) {
//                                            if (cursor < srcText.length() - 1) {
//                                                if ((Character.isWhitespace(srcText.charAt(cursor + 1))) || (srcText.charAt(cursor + 1) == '\u00A0')) {
//                                                    srcText.deleteCharAt(cursor + 1);
//                                                }
//                                                // end of abbreviation
//                                                if ((Pattern.compile("([a-zA-Z]{1,3}\\.[\\s ]?)+").matcher(srcText.subSequence(cursor - 4, cursor - 1)).find() == true) && (Character.isLowerCase(srcText.charAt(cursor + 1)))) {
//                                                    srcText.insert(cursor + 1, ' ');
//                                                }
//                                                // end of a sentence
//                                                if (Character.isUpperCase(srcText.charAt(cursor + 1))) {
//                                                    srcText.insert(cursor + 1, ' ');
//                                                }
//                                            }
//                                            if (cursor > 0) {
//                                                if ((Character.isWhitespace(srcText.charAt(cursor - 1))) || (srcText.charAt(cursor - 1) == '\u00A0')) {
//                                                    if (cursor == srcText.length() - 1) {
//                                                        srcText.deleteCharAt(cursor - 1);
//                                                        cursor -= 1;
//                                                    } else {
//                                                        // period is a decimal marker
//                                                        if ((cursor < srcText.length() - 1) && (!Character.isDigit(srcText.charAt(cursor - 1))) && (Character.isDigit(srcText.charAt(cursor + 1)))) {
//                                                            srcText.deleteCharAt(cursor - 1);
//                                                            cursor -= 1;
//                                                        }
//                                                    }
//                                                }
//                                            }
//                                        } else if (CLOSINGQUOTES.indexOf(quoteChAfter) != -1) {
//                                            if (cursor < srcText.length() - 1) {
//                                                if ((Character.isWhitespace(srcText.charAt(cursor + 1))) || (srcText.charAt(cursor + 1) == '\u00A0')) {
//                                                    srcText.deleteCharAt(cursor + 1);
//                                                }
//                                                // end of abbreviation
//                                                if ((Pattern.compile("([a-zA-Z]{1,3}\\.[\\s ]?)+").matcher(srcText.subSequence(cursor - 4, cursor - 1)).find() == true) && (Character.isLowerCase(srcText.charAt(cursor + 1)))) {
//                                                    srcText.insert(cursor + 1, ' ');
//                                                }
//                                                // end of a sentence
//                                                if (Character.isUpperCase(srcText.charAt(cursor + 1))) {
//                                                    srcText.insert(cursor + 1, ' ');
//                                                }
//                                            }
//                                            if (cursor > 0) {
//                                                if ((Character.isWhitespace(srcText.charAt(cursor - 1))) || (srcText.charAt(cursor - 1) == '\u00A0')) {
//                                                    // end of a sentence
//                                                    if (cursor == srcText.length() - 1) {
//                                                        srcText.deleteCharAt(cursor - 1);
//                                                        cursor -= 1;
//                                                        srcText.deleteCharAt(cursor);
//                                                        srcText.insert(cursor - 1, '.');
//                                                    } else {
//                                                        // period is a decimal marker
//                                                        if ((cursor < srcText.length() - 1) && (!Character.isDigit(srcText.charAt(cursor - 1))) && (Character.isDigit(srcText.charAt(cursor + 1)))) {
//                                                            srcText.deleteCharAt(cursor - 1);
//                                                            cursor -= 1;
//                                                        }
//                                                        // end of an embedded sentence
//                                                        if (cursor < srcText.length() - 1) {
//                                                            srcText.deleteCharAt(cursor - 1);
//                                                            cursor -= 1;
//                                                            if (srcText.charAt(cursor + 1) != ' ') {
//                                                                srcText.insert(cursor + 1, ' ');
//                                                            }
//                                                            // move period inside quotes
//                                                            srcText.deleteCharAt(cursor);
//                                                            srcText.insert(cursor - 1, '.');
//                                                        }
//                                                    }
//                                                }
//                                            }
//                                        } else {
//                                            // default case
//                                            if (cursor < srcText.length() - 1) {
//                                                if ((Character.isWhitespace(srcText.charAt(cursor + 1))) || (srcText.charAt(cursor + 1) == '\u00A0')) {
//                                                    srcText.deleteCharAt(cursor + 1);
//                                                }
//                                                // end of abbreviation
//                                                if ((Pattern.compile("([a-zA-Z]{1,3}\\.[\\s ]?)+").matcher(srcText.subSequence(cursor - 4, cursor - 1)).find() == true) && (Character.isLowerCase(srcText.charAt(cursor + 1)))) {
//                                                    srcText.insert(cursor + 1, ' ');
//                                                }
//                                                // end of a sentence
//                                                if (Character.isUpperCase(srcText.charAt(cursor + 1))) {
//                                                    srcText.insert(cursor + 1, ' ');
//                                                }
//                                            }
//                                            if (cursor > 0) {
//                                                if ((Character.isWhitespace(srcText.charAt(cursor - 1))) || (srcText.charAt(cursor - 1) == '\u00A0')) {
//                                                    // end of a sentence
//                                                    if (cursor == srcText.length() - 1) {
//                                                        srcText.deleteCharAt(cursor - 1);
//                                                        cursor -= 1;
//                                                        srcText.deleteCharAt(cursor);
//                                                        srcText.insert(cursor - 1, '.');
//                                                    } else {
//                                                        // period is a decimal marker
//                                                        if ((cursor < srcText.length() - 1) && (!Character.isDigit(srcText.charAt(cursor - 1))) && (Character.isDigit(srcText.charAt(cursor + 1)))) {
//                                                            srcText.deleteCharAt(cursor - 1);
//                                                            cursor -= 1;
//                                                        }
//                                                        // end of an embedded sentence
//                                                        if (cursor < srcText.length() - 1) {
//                                                            srcText.deleteCharAt(cursor - 1);
//                                                            cursor -= 1;
//                                                            if (srcText.charAt(cursor + 1) != ' ') {
//                                                                srcText.insert(cursor + 1, ' ');
//                                                            }
//                                                            // move period inside quotes
//                                                            srcText.deleteCharAt(cursor);
//                                                            srcText.insert(cursor - 1, '.');
//                                                        }
//                                                    }
//                                                }
//                                            }
//                                        }
//                                    } // end isAfterQuote
//
//                                    // ch is before quote
//                                    if (isBeforeQuote == true) {
//                                        if (OPENINGQUOTES.indexOf(quoteChBefore) != -1) {
//                                            if (cursor < srcText.length() - 1) {
//                                                if ((Character.isWhitespace(srcText.charAt(cursor + 1))) || (srcText.charAt(cursor + 1) == '\u00A0')) {
//                                                    srcText.deleteCharAt(cursor + 1);
//                                                }
//                                                // end of abbreviation
//                                                if (cursor > 3) {
//                                                    if ((Pattern.compile("([a-zA-Z]{1,3}\\.[\\s ]?)+").matcher(srcText.subSequence(cursor - 4, cursor - 1)).find() == true) && (Character.isLowerCase(srcText.charAt(cursor + 1)))) {
//                                                        srcText.insert(cursor + 1, ' ');
//                                                    }
//                                                }
//                                                // end of a sentence
//                                                if (Character.isUpperCase(srcText.charAt(cursor + 1))) {
//                                                    srcText.insert(cursor + 1, ' ');
//                                                }
//                                            }
//                                            if (cursor > 0) {
//                                                if ((Character.isWhitespace(srcText.charAt(cursor - 1))) || (srcText.charAt(cursor - 1) == '\u00A0')) {
//                                                    if (cursor == srcText.length() - 1) {
//                                                        srcText.deleteCharAt(cursor - 1);
//                                                        cursor -= 1;
//                                                    } else {
//                                                        // period is a decimal marker
//                                                        if ((cursor < srcText.length() - 1) && (!Character.isDigit(srcText.charAt(cursor - 1))) && (Character.isDigit(srcText.charAt(cursor + 1)))) {
//                                                            srcText.deleteCharAt(cursor - 1);
//                                                            cursor -= 1;
//                                                        }
//                                                        // end of an embedded sentence
//                                                        if (cursor < srcText.length() - 1) {
//                                                            srcText.deleteCharAt(cursor - 1);
//                                                            cursor -= 1;
//                                                            if (srcText.charAt(cursor + 1) != ' ') {
//                                                                srcText.insert(cursor + 1, ' ');
//                                                            }
//                                                        }
//                                                    }
//                                                }
//                                            }
//                                        } else if (CLOSINGQUOTES.indexOf(quoteChBefore) != -1) {
//                                            if (cursor < srcText.length() - 1) {
//                                                if ((Character.isWhitespace(srcText.charAt(cursor + 1))) || (srcText.charAt(cursor + 1) == '\u00A0')) {
//                                                    srcText.deleteCharAt(cursor + 1);
//                                                }
//                                                // end of abbreviation
//                                                if ((Pattern.compile("([a-zA-Z]{1,3}\\.[\\s ]?)+").matcher(srcText.subSequence(cursor - 4, cursor - 1)).find() == true) && (Character.isLowerCase(srcText.charAt(cursor + 1)))) {
//                                                    srcText.insert(cursor + 1, ' ');
//                                                }
//                                                // end of a sentence
//                                                if (Character.isUpperCase(srcText.charAt(cursor + 1))) {
//                                                    srcText.insert(cursor + 1, ' ');
//                                                }
//                                            }
//                                            if (cursor > 0) {
//                                                if ((Character.isWhitespace(srcText.charAt(cursor - 1))) || (srcText.charAt(cursor - 1) == '\u00A0')) {
//                                                    // end of a sentence
//                                                    if (cursor == srcText.length() - 1) {
//                                                        srcText.deleteCharAt(cursor - 1);
//                                                        cursor -= 1;
//                                                        srcText.deleteCharAt(cursor);
//                                                        srcText.insert(cursor - 1, '.');
//                                                    } else {
//                                                        // period is a decimal marker
//                                                        if ((cursor < srcText.length() - 1) && (!Character.isDigit(srcText.charAt(cursor - 1))) && (Character.isDigit(srcText.charAt(cursor + 1)))) {
//                                                            srcText.deleteCharAt(cursor - 1);
//                                                            cursor -= 1;
//                                                        }
//                                                        // end of an embedded sentence
//                                                        if (cursor < srcText.length() - 1) {
//                                                            srcText.deleteCharAt(cursor - 1);
//                                                            cursor -= 1;
//                                                            if (srcText.charAt(cursor + 1) != ' ') {
//                                                                srcText.insert(cursor + 1, ' ');
//                                                            }
//                                                        }
//                                                    }
//                                                }
//                                            }
//                                        } else {
//                                            // default case
//                                            if (cursor < srcText.length() - 1) {
//                                                if ((Character.isWhitespace(srcText.charAt(cursor + 1))) || (srcText.charAt(cursor + 1) == '\u00A0')) {
//                                                    srcText.deleteCharAt(cursor + 1);
//                                                }
//                                                // end of abbreviation
//                                                if ((Pattern.compile("([a-zA-Z]{1,3}\\.[\\s ]?)+").matcher(srcText.subSequence(cursor - 4, cursor - 1)).find() == true) && (Character.isLowerCase(srcText.charAt(cursor + 1)))) {
//                                                    srcText.insert(cursor + 1, ' ');
//                                                }
//                                                // end of a sentence
//                                                if (Character.isUpperCase(srcText.charAt(cursor + 1))) {
//                                                    srcText.insert(cursor + 1, ' ');
//                                                }
//                                            }
//                                            if (cursor > 0) {
//                                                if ((Character.isWhitespace(srcText.charAt(cursor - 1))) || (srcText.charAt(cursor - 1) == '\u00A0')) {
//                                                    // end of a sentence
//                                                    if (cursor == srcText.length() - 1) {
//                                                        srcText.deleteCharAt(cursor - 1);
//                                                        cursor -= 1;
//                                                        srcText.deleteCharAt(cursor);
//                                                        srcText.insert(cursor - 1, '.');
//                                                    } else {
//                                                        // period is a decimal marker
//                                                        if ((cursor < srcText.length() - 1) && (!Character.isDigit(srcText.charAt(cursor - 1))) && (Character.isDigit(srcText.charAt(cursor + 1)))) {
//                                                            srcText.deleteCharAt(cursor - 1);
//                                                            cursor -= 1;
//                                                        }
//                                                        // end of an embedded sentence
//                                                        if (cursor < srcText.length() - 1) {
//                                                            srcText.deleteCharAt(cursor - 1);
//                                                            cursor -= 1;
//                                                            if (srcText.charAt(cursor + 1) != ' ') {
//                                                                srcText.insert(cursor + 1, ' ');
//                                                            }
//                                                            // move period inside quotes
//                                                            srcText.deleteCharAt(cursor);
//                                                            srcText.insert(cursor - 1, '.');
//                                                        }
//                                                    }
//                                                }
//                                            }
//                                        }
//                                    } // end isBeforeQuote
//                                    break;
//                                case '\u002C': // comma ","
//                                    break;
//                                case '\u003B': // semicolon ";"
//                                    break;
//                                case '\u003A': // colon ":"
//                                    break;
//                                case '\u0021': // exclamation mark "!"
//                                    break;
//                                case '\u00A1': // inverted exclamation mark "¡"
//                                    break;
//                                case '\u003F': // question mark "?"
//                                    break;
//                                case '\u00BF': // inverted question mark "¿"
//                                    break;
//                                default:
//                                    break;
//                            }
//                        }
//                    }
//                } else { // end PUNCTUATION
//                    // Quotation
//                    if (nearPunc >= -1) {
//                        // check proximity backwards
//                        proxyCheck = cursor;
//                        while ((proxyCheck > 0) && ((cursor - proxyCheck) < proximityThreshold) && (isNearPuncuation == false)) {
//                            proxyCheck -= 1;
//                            if (PUNCTUATION.indexOf(srcText.charAt(proxyCheck)) != -1) {
//                                nearPunc = proxyCheck;
//                                isNearPuncuation = true;
//                                puncCh = srcText.charAt(proxyCheck);
//                                break;
//                            }
//                        }
//                        // check proximity forwards
//                        proxyCheck = cursor;
//                        while ((proxyCheck < srcText.length() - 1) && ((proxyCheck - cursor) < proximityThreshold) && (isNearPuncuation == false)) {
//                            proxyCheck += 1;
//                            if (PUNCTUATION.indexOf(srcText.charAt(proxyCheck)) != -1) {
//                                nearPunc = proxyCheck;
//                                isNearPuncuation = true;
//                                puncCh = srcText.charAt(proxyCheck);
//                                break;
//                            }
//                        }
//                        // TODO: punctuation is both back and forwards.
//
//                        // TODO: process QUOTES based on rules and known attributes
//                        if (isNearPuncuation == false) {
//                            if (doQuotes == true) {
//                                // TODO: treat QUOTES according to rules
//                            }
//                        } else {
//                            if (doQuotes == true) {
//                                // TODO: exceptions to the rules
//                            }
//                        }
//                    }
//                } // end QUOTATION
//
//                // reset booleans
//                quoteChBefore = 0;
//                quoteChAfter = 0;
//                puncCh = 0;
//                nearQuote = -1;
//                nearPunc = -1;
//                isPunctuation = false;
//                isQuote = false;
//                isNearQuote = false;
//                isBeforeQuote = false;
//                isAfterQuote = false;
//                isNearPuncuation = false;
//
//            } // end MARKS
//
//            // iterate
//            cursor += 1;
//        }
//
//        // save updated strings
//        seg.getContent().setCodedText(srcText.toString());
    }

    /**
     * Attempts to make punctuation and spacing around punctuation consistent
     * according to standard English punctuation rules. Assumptions: 1) all
     * strings passed have consistent spacing (only single spaces) 2) quotes
     * have been normalized 3) strings will need post-processing in order to
     * correct spacing for languages such as French. This ignores locale and
     * Asian punctuation.
     *
     * @param srcFrag : original text to be normalized
     * @param trgFrag : target text to be normalized
     */
    protected void normalizePunctuation(TextFragment srcFrag, TextFragment trgFrag) {

        //TODO: update method to take the TU, Segment, Locale as parameters.

        StringBuilder srcText = new StringBuilder(srcFrag.getCodedText());
        StringBuilder trgText = new StringBuilder(trgFrag.getCodedText());

        int cur = 0;
        int ch = 0;

        // normalize source
        while (cur <= srcText.length() - 1) {
            ch = srcText.charAt(cur);
            if (PUNCTUATION.indexOf(ch) != -1) {
                switch (ch) {
                    case '\u002E': // .
                        if (cur < srcText.length() - 1) {
                            if ((Character.isWhitespace(srcText.charAt(cur + 1)))
                                    || (srcText.charAt(cur + 1) == '\u00A0')) {
                                srcText.deleteCharAt(cur + 1);
                            }
                        }
                        if (cur > 0) {
                            if ((Character.isWhitespace(srcText.charAt(cur - 1)))
                                    || (srcText.charAt(cur - 1) == '\u00A0')) {
                                if (cur == srcText.length() - 1) {
                                    srcText.deleteCharAt(cur - 1);
                                    cur -= 1;
                                } else if ((cur < srcText.length() - 1)
                                        && (Character.isDigit(srcText.charAt(cur + 1)))) {
                                    srcText.deleteCharAt(cur - 1);
                                    cur -= 1;
                                } else {
                                    break;
                                }
                            }
                        }
                        break;
                    case '\u002C': // ,
                        if (cur > 0) {
                            if (cur < srcText.length() - 1) {
                                if (((Character.isWhitespace(srcText.charAt(cur - 1))) || (srcText.charAt(cur - 1) == '\u00A0'))
                                        && (!Character.isDigit(srcText.charAt(cur + 1)))) {
                                    srcText.deleteCharAt(cur - 1);
                                    cur -= 1;
                                }
                            }
                        }
                        if (cur < srcText.length() - 1) {
                            if (Character.isWhitespace(srcText.charAt(cur + 1))) {
                                srcText.deleteCharAt(cur + 1);
                            }
//                            if ((!Character.isWhitespace(srcText.charAt(cur + 1)))
//                                    && (srcText.charAt(cur + 1) != '\u00A0')
//                                    && (SPECIALPUNC.indexOf(srcText.charAt(cur + 1)) == -1)) {
//                                srcText.insert(cur + 1, ' ');
//                            }
                        char chB = srcText.charAt(cur - 1);
                        char chA = srcText.charAt(cur + 1);
                        boolean dig = Character.isDigit(srcText.charAt(cur + 1));
                        int sp = SPECIALPUNC.indexOf(srcText.charAt(cur + 1));
                        if ((!Character.isDigit(srcText.charAt(cur + 1))) && (SPECIALPUNC.indexOf(srcText.charAt(cur + 1)) == -1)) {
                            srcText.insert(cur + 1, ' ');
                        }
                }
                break;
            
        
        case '\u003B': // ;
                        if (cur > 0) {
                            if ((Character.isWhitespace(srcText.charAt(cur - 1)))
                                    || (srcText.charAt(cur - 1) == '\u00A0')) {
                                srcText.deleteCharAt(cur - 1);
                                cur -= 1;
                            }
                        }
                        if (cur < srcText.length() - 1) {
                            if ((!Character.isWhitespace(srcText.charAt(cur + 1)))
                                    && (srcText.charAt(cur + 1) != '\u00A0')) {
                                srcText.insert(cur + 1, ' ');
                            }
                        }
                        break;
                    case '\u003A': // :
                        if (cur > 0) {
                            if ((Character.isWhitespace(srcText.charAt(cur - 1)))
                                    || (srcText.charAt(cur - 1) == '\u00A0')) {
                                srcText.deleteCharAt(cur - 1);
                                cur -= 1;
                            }
                        }
                        if (cur < srcText.length() - 1) {
                            if ((!Character.isWhitespace(srcText.charAt(cur + 1)))
                                    && (srcText.charAt(cur + 1) != '\u00A0')) {
                                srcText.insert(cur + 1, ' ');
                            }
                        }
                        break;
                    case '\u0021': // !
                        if (cur < srcText.length() - 1) {
                            if ((Character.isWhitespace(srcText.charAt(cur + 1)))
                                    || (srcText.charAt(cur + 1) == '\u00A0')) {
                                srcText.deleteCharAt(cur + 1);
                            }
                        }
                        if (cur > 0) {
                            if ((Character.isWhitespace(srcText.charAt(cur - 1)))
                                    || (srcText.charAt(cur - 1) == '\u00A0')) {
                                srcText.deleteCharAt(cur - 1);
                                cur -= 1;
                            }
                        }
                        break;
                    case '\u00A1': // ¡
                        if (cur >= 0) {
                            if ((Character.isWhitespace(srcText.charAt(cur - 1)))
                                    || (srcText.charAt(cur - 1) == '\u00A0')) {
                                srcText.deleteCharAt(cur - 1);
                                cur -= 1;
                            }
                        }
                        break;
                    case '\u003F': // ?
                        if (cur < srcText.length() - 1) {
                            if ((Character.isWhitespace(srcText.charAt(cur + 1)))
                                    || (srcText.charAt(cur + 1) == '\u00A0')) {
                                srcText.deleteCharAt(cur + 1);
                            }
                        }
                        if (cur > 0) {
                            if ((Character.isWhitespace(srcText.charAt(cur - 1)))
                                    || (srcText.charAt(cur - 1) == '\u00A0')) {
                                srcText.deleteCharAt(cur - 1);
                                cur -= 1;
                            }
                        }
                        break;
                    case '\u00BF': // ¿
                        if (cur >= 0) {
                            if ((Character.isWhitespace(srcText.charAt(cur - 1)))
                                    || (srcText.charAt(cur - 1) == '\u00A0')) {
                                srcText.deleteCharAt(cur - 1);
                                cur -= 1;
                            }
                        }
                        break;
                    default:
                        break;
                }
            }
            cur += 1;
    }
    // normalize target
    cur  = 0;
    while (cur

    <= trgText.length () 
        - 1) {
            ch = trgText.charAt(cur);
        if (PUNCTUATION.indexOf(ch) != -1) {
            switch (ch) {
                case '\u002E': // .
                    if (cur < trgText.length() - 1) {
                        if ((Character.isWhitespace(trgText.charAt(cur + 1)))
                                || (trgText.charAt(cur + 1) == '\u00A0')) {
                            trgText.deleteCharAt(cur + 1);
                        }
                    }
                    if (cur > 0) {
                        if ((Character.isWhitespace(trgText.charAt(cur - 1)))
                                || (trgText.charAt(cur - 1) == '\u00A0')) {
                            if (cur == trgText.length() - 1) {
                                trgText.deleteCharAt(cur - 1);
                                cur -= 1;
                            } else if ((cur < trgText.length() - 1)
                                    && (Character.isDigit(trgText.charAt(cur + 1)))) {
                                trgText.deleteCharAt(cur - 1);
                                cur -= 1;
                            } else {
                                break;
                            }
                        }
                    }
                    break;
                case '\u002C': // ,
                    if (cur > 0) {
                        if ((cur < trgText.length() - 1)
                                && (!Character.isDigit(trgText.charAt(cur + 1)))) {
                            trgText.deleteCharAt(cur - 1);
                            cur -= 1;
                        } else {
                            break;
                        }
                    }
                    if (cur < trgText.length() - 1) {
                        if ((!Character.isWhitespace(trgText.charAt(cur + 1)))
                                && (trgText.charAt(cur + 1) != '\u00A0')) {
                            trgText.insert(cur + 1, ' ');
                        }
                    }
                    break;
                case '\u003B': // ;
                    if (cur > 0) {
                        if ((Character.isWhitespace(trgText.charAt(cur - 1)))
                                || (trgText.charAt(cur - 1) == '\u00A0')) {
                            trgText.deleteCharAt(cur - 1);
                            cur -= 1;
                        }
                    }
                    if (cur < trgText.length() - 1) {
                        if ((!Character.isWhitespace(trgText.charAt(cur + 1)))
                                && (trgText.charAt(cur + 1) != '\u00A0')) {
                            trgText.insert(cur + 1, ' ');
                        }
                    }
                    break;
                case '\u003A': // :
                    if (cur > 0) {
                        if ((Character.isWhitespace(trgText.charAt(cur - 1)))
                                || (trgText.charAt(cur - 1) == '\u00A0')) {
                            trgText.deleteCharAt(cur - 1);
                            cur -= 1;
                        }
                    }
                    if (cur < trgText.length() - 1) {
                        if ((!Character.isWhitespace(trgText.charAt(cur + 1)))
                                && (trgText.charAt(cur + 1) != '\u00A0')) {
                            trgText.insert(cur + 1, ' ');
                        }
                    }
                    break;
                case '\u0021': // !
                    if (cur < trgText.length() - 1) {
                        if ((Character.isWhitespace(trgText.charAt(cur + 1)))
                                || (trgText.charAt(cur + 1) == '\u00A0')) {
                            trgText.deleteCharAt(cur + 1);
                        }
                    }
                    if (cur > 0) {
                        if ((Character.isWhitespace(trgText.charAt(cur - 1)))
                                || (trgText.charAt(cur - 1) == '\u00A0')) {
                            trgText.deleteCharAt(cur - 1);
                            cur -= 1;
                        }
                    }
                    break;
                case '\u00A1': // ¡
                    if (cur >= 0) {
                        if ((Character.isWhitespace(trgText.charAt(cur - 1)))
                                || (trgText.charAt(cur - 1) == '\u00A0')) {
                            trgText.deleteCharAt(cur - 1);
                            cur -= 1;
                        }
                    }
                    break;
                case '\u003F': // ?
                    if (cur < trgText.length() - 1) {
                        if ((Character.isWhitespace(trgText.charAt(cur + 1)))
                                || (trgText.charAt(cur + 1) == '\u00A0')) {
                            trgText.deleteCharAt(cur + 1);
                        }
                    }
                    if (cur > 0) {
                        if ((Character.isWhitespace(trgText.charAt(cur - 1)))
                                || (trgText.charAt(cur - 1) == '\u00A0')) {
                            trgText.deleteCharAt(cur - 1);
                            cur -= 1;
                        }
                    }
                    break;
                case '\u00BF': // ¿
                    if (cur >= 0) {
                        if ((Character.isWhitespace(trgText.charAt(cur - 1)))
                                || (trgText.charAt(cur - 1) == '\u00A0')) {
                            trgText.deleteCharAt(cur - 1);
                            cur -= 1;
                        }
                    }
                    break;
                default:
                    break;
            }
        }
        cur += 1;
    }

    // save updated strings
    srcFrag.setCodedText (srcText.toString

    ());
        trgFrag.setCodedText (trgText.toString

());
    }

    /**
     * Effectively marks the segment for removal by emptying the content for the
     * given target. the text unit will be pruned by a different method
     * ({@link #pruneTextUnit(ITextUnit, LocaleId)}).
     *
     * @param tu the text unit containing the content
     * @param seg the segment to be marked for removal
     * @param targetLocale the locale for which the segment should be removed
     */
    protected void markSegmentForRemoval(ITextUnit tu, Segment seg, LocaleId targetLocale) {

        tu.getTargetSegment(targetLocale, seg.getId(), false).getContent().clear();
    }

    /**
     * Marks segments for removal that contain text which match given regular
     * expressions. Allows for marking segments which match user specified
     * regular expressions.
     *
     * @param tu the text unit containing the segments to be matched
     * @param seg the segment to analyze
     * @param targetLocale the locale
     */
    protected void matchRegexExpressions(ITextUnit tu, Segment seg, LocaleId targetLocale) {

        Pattern pattern;
        StringBuilder srcText = new StringBuilder(seg.text);
        StringBuilder trgText = new StringBuilder(tu.getTargetSegment(targetLocale, seg.getId(), false).text);
        boolean alreadyFound = false;

        // match user specified regex expression
        if (params.getMatchUserRegex() == true) {
            if ((params.getUserRegex() != null) && !params.getUserRegex().isEmpty()) {
                // compile user string
                try {
                    pattern = Pattern.compile(params.getUserRegex());

                    // find matching text
                    if ((pattern.matcher(srcText).find()) || (pattern.matcher(trgText).find())) {
                        alreadyFound = true;
                        markSegmentForRemoval(tu, seg, targetLocale);
                    }
                } catch (PatternSyntaxException patException) {
                    LOGGER.error("The following error occured \"{}\" in the expression: {}.", patException.getDescription(), patException.getPattern());
                }
            }
        }
        // skip "standard" regex expressions if already marked for removal
        if (alreadyFound != true) {
            // TODO: other "standard" regex expressions to match against.
        }

    }

    /**
     * Removes segments from the text unit marked as not containing useful
     * information.
     *
     * @param tu text unit to be pruned of unwanted segments
     * @param targetLocale locale of target through which to search
     * @return true if entire text unit is to be discarded false if text unit
     * contains good translated text
     */
    protected boolean pruneTextUnit(ITextUnit tu, LocaleId targetLocale) {

        if (!tu.isEmpty()) {
            TextContainer tc = tu.getSource();
            ISegments srcSegs = tc.getSegments();
            int cursor = 0;

            while (cursor <= srcSegs.count() - 1) {
                // get segments
                Segment srcSeg = srcSegs.get(cursor);
                Segment trgSeg = tu.getTargetSegment(targetLocale, srcSeg.getId(), false);

                // check for segments to remove 
                if (cursor < srcSegs.count() - 1) {
                    if (trgSeg.text.isEmpty()) {
                        tc.remove(srcSegs.getIndex(srcSeg.getId()));
                        continue;
                    }
                } else {
                    // last segment in tu
                    if (trgSeg.text.isEmpty()) {
                        return true;
                    }
                }
                // increment
                cursor += 1;
            }
        } else {
            return true;
        }
        return false;
    }

    /**
     * Wrapper for removing character corruption and detecting unexpected
     * characters.
     *
     * @param tu: the TextUnit containing the segments to update
     * @param seg: the Segment to update
     * @param targetLocale: the language for which the text should be updated
     */
    protected void checkCharacters(ITextUnit tu, Segment seg, LocaleId targetLocale) {

        // check for standard corruption
        removeCorruptions(tu, seg, targetLocale);

        // check for strings of unusual characters
        checkUnusualCharacters(tu, seg, targetLocale);
    }

    /**
     * Attempts to detect character corruption from either the source or target.
     * If any corruption are detected, the segment is marked for removal.
     *
     * @param tu the text unit to be modified
     * @param seg the source segment to be modified
     * @param targetLocale the locale used to fetch the target text
     */
    private void removeCorruptions(ITextUnit tu, Segment seg, LocaleId targetLocale) {

        TextFragment trgFrag = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent();
        StringBuilder srcText = new StringBuilder(seg.getContent().getCodedText());
        StringBuilder trgText = new StringBuilder(trgFrag.getCodedText());

        Matcher matcher;
        String corruptionRegex = "\\u00C3[\\u00A4-\\u00B6]|\\u00C3\\u201E|\\u00C3\\u2026|\\u00C3\\u2013";

        // find corruption in source
        matcher = Pattern.compile(corruptionRegex).matcher(srcText);
        if (matcher.find() == true) {
            this.markSegmentForRemoval(tu, seg, targetLocale);
        }

        // find corruption in target
        matcher = Pattern.compile(corruptionRegex).matcher(trgText);
        if (matcher.find() == true) {
            this.markSegmentForRemoval(tu, seg, targetLocale);
        }
    }

    private void checkUnusualCharacters(ITextUnit tu, Segment seg, LocaleId targetLocale) {

        TextFragment trgFrag = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent();
        StringBuilder srcText = new StringBuilder(seg.getContent().getCodedText());
        StringBuilder trgText = new StringBuilder(trgFrag.getCodedText());

        boolean isFound = false;
        Pattern pattern;

        // any 3 extended characters together 
        pattern = Pattern.compile("[\\u00C0-\\u00FF]{3}");

        // check source
        if ((pattern.matcher(srcText).find() == true) && (isFound == false)) {
            isFound = true;
            markSegmentForRemoval(tu, seg, targetLocale);
        }

        // check target
        if ((pattern.matcher(trgText).find() == true) && (isFound == false)) {
            isFound = true;
            markSegmentForRemoval(tu, seg, targetLocale);
        }

    }

    private void checkCharacterSet(ITextUnit tu, Segment seg, LocaleId targetLocale) {
        CharsetEncoder encoder1 = null;
        CharsetEncoder encoder2 = null;
        Pattern extraCharsAllowed = null;
        Pattern itsAllowedChars = null;
        String itsAllowedCharsPattern = "\u0000";

        StringBuilder trgOri = new StringBuilder(tu.getTargetSegment(targetLocale, seg.getId(), false).text);

        StringBuilder badChars = new StringBuilder();
        int pos = -1;
        int badChar = 0;
        int count = 0;

        // this needs to be set some how from targetLocale
        String charsetName = null;

        if (!Util.isEmpty(charsetName)) {
            encoder1 = Charset.forName(charsetName).newEncoder();
        }
        // Extra characters allowed
        // these needs to be set some how
        boolean allowExtraCharacters = false;
        String getExtraCharsAllowed = "";
        if (!allowExtraCharacters) {
            extraCharsAllowed = Pattern.compile(getExtraCharsAllowed);
        }

        for (int i = 0; i < trgOri.length(); i++) {
            char ch = trgOri.charAt(i);

            if (encoder1 != null) {
                if (encoder1.canEncode(ch)) {
                    continue; // Allowed, move to the next character
                } else { // Not included in the target charset
                    // Check if it is included in the extra characters
                    // list
                    if (extraCharsAllowed != null) {
                        Matcher m = extraCharsAllowed.matcher(trgOri.subSequence(i, i + 1));
                        if (m.find()) {
                            // Part of the extra character list: it's OK
                            continue; // Move to the next character
                        }
                        // Else: not allowed: fall thru
                    }
                }
            } else { // Not charset defined, try just the extra characters list
                if (extraCharsAllowed != null) {
                    Matcher m = extraCharsAllowed.matcher(trgOri.subSequence(i, i + 1));
                    if (m.find()) {
                        // Part of the extra character list: it's OK
                        continue; // Move to the next character
                    }
                    // Else: not allowed: fall thru
                }
                // Else: not in charset, nor in extra characters list: not
                // allowed
            }

            // The character is not allowed: add the error
            if (++count > 1) {
                if (badChars.indexOf(String.valueOf(ch)) == -1) {
                    badChars.append(ch);
                }
            } else {
                pos = i;
                badChar = ch;
            }
        }

        // Do we have one or more errors?
//	       if (pos > -1) {
//            if (count > 1) {
//                reportIssue(IssueType.ALLOWED_CHARACTERS, tu, null, String.format("The character '%c' (U+%04X) is not allowed in the target text."
//                        + " Other forbidden characters found: ", badChar, (int) badChar)
//                        + badChars.toString(), 0, -1, pos, pos + 1, Issue.SEVERITY_MEDIUM, srcOri, trgOri, null);
//            } else {
//                reportIssue(IssueType.ALLOWED_CHARACTERS, tu, null, String.format("The character '%c' (U+%04X) is not allowed in the target text.", badChar, (int) badChar), 0, -1, pos, pos + 1, Issue.SEVERITY_MEDIUM, srcOri, trgOri, null);
//            }
//        }
    }
// end class
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy