info.bliki.wiki.filter.WikipediaScanner Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of bliki-core Show documentation
This is the core project.
The newest version!
package info.bliki.wiki.filter;

import info.bliki.wiki.model.ITableOfContent;
import info.bliki.wiki.model.IWikiModel;
import info.bliki.wiki.tags.util.NodeAttribute;
import info.bliki.wiki.tags.util.WikiTagNode;

import java.util.ArrayList;
import java.util.List;

public class WikipediaScanner {
    /**
     * Sentinel value {@code -1}. It is not referenced in the visible part of
     * this class; presumably it signals "end of input" (TODO confirm).
     */
    public static final int EOF = -1;

    /**
     * Current read offset into {@link #fSource}. The scan methods advance it
     * as they consume characters.
     */
    protected int fScannerPosition;

    /**
     * Wiki model that is handed to the {@code createTagStack(...)} calls of
     * table cells and list elements. Not initialised by the constructors; set
     * through {@link #setModel(IWikiModel)}.
     */
    protected IWikiModel fWikiModel;

    /**
     * The String of the given raw wiki text
     */
    protected final String fStringSource;

    /**
     * The corresponding char[] array for the string source
     */
    protected final char[] fSource;

    /**
     * Creates a scanner over {@code src} that starts at offset 0.
     *
     * @param src
     *          the raw wiki text to scan
     */
    public WikipediaScanner(String src) {
        this(src, 0);
    }

    /**
     * Creates a scanner over {@code src} that starts at the given offset.
     *
     * @param src
     *          the raw wiki text to scan; it is kept both as a String and as a
     *          char array copy
     * @param position
     *          the initial scanner position (not range-checked here)
     */
    public WikipediaScanner(String src, int position) {
        fSource = src.toCharArray();
        fStringSource = src;
        fScannerPosition = position;
    }

    /**
     * Sets the wiki model that is passed on to the {@code createTagStack(...)}
     * calls made while scanning tables and lists.
     *
     * @param wikiModel
     *          the model to use
     */
    public void setModel(IWikiModel wikiModel) {
        fWikiModel = wikiModel;
    }

    /**
     * @return the current scanner position (offset into the source text)
     */
    public int getPosition() {
        return fScannerPosition;
    }

    /**
     * Moves the scanner to a new offset. The value is stored as is, without
     * any range check.
     *
     * @param newPos
     *          the new scanner position
     */
    public void setPosition(int newPos) {
        fScannerPosition = newPos;
    }

    /**
     * Scan a wikipedia table.
     *
     * See: Help - Table
     *
     * The scanner position must point at the opening "{|" of the table (a
     * negative position is treated as 0). The rest of that line is stored as
     * the table parameters; afterwards the text is read character by
     * character and split into rows and cells. End of input is detected by
     * running off the source array: the resulting IndexOutOfBoundsException is
     * caught, the scanner position is set to the end of the source and
     * everything collected so far is returned.
     *
     * @param tableOfContentTag
     *          not referenced in this method body
     * @return the scanned table, or null if the text at the scanner position
     *         does not start with "{|" (including the case that the source
     *         ends there)
     */
    public WPTable wpTable(ITableOfContent tableOfContentTag) {
        WPTable table = null;
        // the cell currently being read; null while no cell is open
        WPCell cell = null;
        ArrayList cells = new ArrayList<>();
        // cells that appear before the first "|-" are collected in this row
        WPRow row = new WPRow(cells);
        try {
            if (fScannerPosition < 0) {
                // simulate newline
                fScannerPosition = 0;
            }
            // the table must start with "{|"; note that the position has
            // already been advanced when null is returned here
            if (fSource[fScannerPosition++] != '{') {
                return null;
            }
            if (fSource[fScannerPosition++] != '|') {
                return null;
            }
            ArrayList rows = new ArrayList<>();
            table = new WPTable(rows);
            int startPos = fScannerPosition;
            // read parameters until end of line
            // (nextNewline() leaves the position on the '\n' itself)
            nextNewline();
            table.setParams(fStringSource.substring(startPos, fScannerPosition));

            char ch = ' ';

            while (true) {
                ch = fSource[fScannerPosition++];
                switch (ch) {
                case '[':
                    // jump behind the matching ']' so that characters inside
                    // the brackets are not examined by this loop
                    int position = findNestedEndSingle(fSource, '[', ']', fScannerPosition);
                    if (position >= 0) {
                        fScannerPosition = position;
                        continue;
                    }
                    break;
                case '{':
                    // same for a '{' ... '}' span
                    int cposition = findNestedEndSingle(fSource, '{', '}', fScannerPosition);
                    if (cposition >= 0) {
                        fScannerPosition = cposition;
                        continue;
                    }
                    break;
                case '\n':
                    // table markup is recognised at the beginning of a line
                    ch = fSource[fScannerPosition++];
                    // ignore whitespace at the beginning of the line
                    while (ch == ' ' || ch == '\t') {
                        ch = fSource[fScannerPosition++];
                    }
                    switch (ch) {
                    case '|': // "\n |"
                        if (cell != null) {
                            // close the open cell; the offset passed is the
                            // character in front of the '|' just consumed
                            // (presumably an exclusive end offset -
                            // createTagStack is not visible here)
                            cell.createTagStack(table, fSource, fWikiModel, fScannerPosition - 2);
                            cell = null;
                        }

                        ch = fSource[fScannerPosition++];
                        switch (ch) {
                        case '-': // new row - "\n|-"
                            addTableRow(table, row);
                            cells = new ArrayList<>();
                            row = new WPRow(cells);
                            startPos = fScannerPosition;
                            // cell is always null at this point, so this only
                            // advances to the end of the line
                            nextNewlineCell(cell);
                            row.setParams(fStringSource.substring(startPos, fScannerPosition));
                            break;
                        case '+': // new row - "\n|+"
                            // the caption becomes a row of its own that holds a
                            // single caption cell spanning the rest of the line
                            addTableRow(table, row);
                            cells = new ArrayList<>();
                            row = new WPRow(cells);
                            row.setType(WPCell.CAPTION);
                            cell = new WPCell(fScannerPosition);
                            cell.setType(WPCell.CAPTION);
                            cells.add(cell);
                            nextNewlineCell(cell);
                            cell.createTagStack(table, fSource, fWikiModel, fScannerPosition);
                            cell = null;

                            // start a fresh row for whatever follows the caption
                            addTableRow(table, row);
                            cells = new ArrayList<>();
                            row = new WPRow(cells);
                            break;
                        case '}': // end of table - "\n|}"
                            addTableRow(table, row);
                            return table;
                        default:
                            // ordinary cell: push the character back, the
                            // cell starts at it
                            fScannerPosition--;
                            cell = new WPCell(fScannerPosition);
                            cells.add(cell);
                        }

                        break;
                    case '!': // "\n !"
                        if (cell != null) {
                            cell.createTagStack(table, fSource, fWikiModel, fScannerPosition - 2);
                            cell = null;
                        }
                        // one character behind the '!' is consumed here; the
                        // header cell starts at that character
                        ch = fSource[fScannerPosition++];
                        cell = new WPCell(fScannerPosition - 1);
                        cell.setType(WPCell.TH);
                        cells.add(cell);

                        break;
                    case '{': // "\n {"
                        if (fSource[fScannerPosition] == '|') {
                            // start of nested table?
                            // NOTE(review): indexEndOfTable() returns -1 when no
                            // end is found; the next read at index -1 then
                            // throws and is handled by the catch block below
                            fScannerPosition = indexEndOfTable();
                            break;
                        }
                        break;
                    default:
                        // no table markup at the line start: re-read the
                        // character in the outer loop
                        fScannerPosition--;
                    }
                    break;
                case '|':
                    ch = fSource[fScannerPosition++];
                    if (ch == '|') {
                        // "||" separates cells on the same line
                        if (cell != null) {
                            cell.createTagStack(table, fSource, fWikiModel, fScannerPosition - 2);
                            cell = null;
                        }
                        cell = new WPCell(fScannerPosition);
                        cells.add(cell);
                    } else {
                        // single '|': push back the look-ahead character and
                        // record the index of the '|' on the open cell
                        fScannerPosition--;
                        if (cell != null) {
                            cell.setAttributesStartPos(fScannerPosition - 1);
                        }
                    }
                    break;
                case '!':
                    ch = fSource[fScannerPosition++];
                    if (ch == '!') {
                        // "!!" separates header cells on the same line
                        if (cell != null) {
                            cell.createTagStack(table, fSource, fWikiModel, fScannerPosition - 2);
                            cell = null;
                        }
                        cell = new WPCell(fScannerPosition);
                        cell.setType(WPCell.TH);
                        cells.add(cell);
                    } else {
                        fScannerPosition--;
                    }
                    break;
                default:
                    // content outside of any cell opens a cell of undefined type
                    if (cell == null) {
                        cell = new WPCell(fScannerPosition - 1);
                        cell.setType(WPCell.UNDEFINED);
                        cells.add(cell);
                    }
                }
            }
        } catch (IndexOutOfBoundsException e) {
            // end of input reached without a closing "|}": finish the open
            // cell and row with what has been read so far
            fScannerPosition = fSource.length;
            if (cell != null) {
                cell.createTagStack(table, fSource, fWikiModel, fScannerPosition);
                cell = null;
            }
            if (table != null && row != null && row.size() > 0) {
                addTableRow(table, row);
            }
        }
        if (table != null) {
            return table;
        }
        return null;
    }

    /**
     * Appends {@code row} to {@code table} unless the row is completely empty.
     * A row is kept when it has parameters (even without cells) or when it
     * contains at least one cell.
     *
     * @param table
     *          the table that receives the row
     * @param row
     *          the row to append
     */
    private void addTableRow(WPTable table, WPRow row) {
        final boolean hasParams = row.getParams() != null;
        if (hasParams || row.size() > 0) {
            table.add(row);
        }
    }

    /**
     * Scans a block of wiki list lines starting at the current scanner
     * position and collects them as {@link WPListElement}s in a
     * {@link WPList}.
     *
     * A negative scanner position is treated as "in front of the first line":
     * the position is set to 0 and a newline is simulated. Scanning stops at
     * the first line that does not start with one of the list marker
     * characters defined in {@link WPList}; the scanner position is then reset
     * to the start of that line. If the input ends first, the scanner position
     * is set to {@code fSource.length + 1} and the open element is closed at
     * the end of the source.
     *
     * @return the scanned list (possibly without elements), or null if the
     *         very first read fails because the scanner position lies outside
     *         the source
     */
    public WPList wpList() {
        WPList list = null;
        // the list element currently being read; null while none is open
        WPListElement listElement = null;
        int startPosition;
        try {
            char ch;
            // last marker character of the current line's marker sequence
            char lastCh = ' ';
            // marker characters of the current list line
            char[] sequence = null;
            // number of marker characters in 'sequence'
            int count = 0;

            if (fScannerPosition < 0) {
                // simulate newline
                fScannerPosition = 0;
                ch = '\n';
            } else {
                ch = fSource[fScannerPosition++];
            }

            list = new WPList();

            while (true) {
                if (ch == WPList.DL_DD_CHAR) {
                    // a DL_DD_CHAR directly followed by "//" and directly
                    // preceded by a letter is skipped together with the "//"
                    if ((fScannerPosition < fSource.length - 2) && fSource[fScannerPosition] == '/' && fSource[fScannerPosition + 1] == '/') {
                        if (fScannerPosition > 1 && Character.isLetter(fSource[fScannerPosition - 2])) {
                            // definition list with URL link
                            fScannerPosition += 2;
                            ch = fSource[fScannerPosition++];
                            continue;
                        }
                    }

                    if (lastCh == WPList.DL_DT_CHAR && sequence != null) {
                        // the current line started with a marker sequence
                        // ending in DL_DT_CHAR: this DL_DD_CHAR closes the open
                        // element and starts a new one on the same line
                        startPosition = fScannerPosition;
                        if (listElement != null) {
                            listElement.createTagStack(fSource, fWikiModel, fScannerPosition - 1);
                            list.add(listElement);
                            listElement = null;
                        }
                        // same marker sequence, but with the last marker
                        // replaced by DL_DD_CHAR
                        char[] ddSequence = new char[sequence.length];
                        System.arraycopy(sequence, 0, ddSequence, 0, sequence.length);
                        ddSequence[sequence.length - 1] = WPList.DL_DD_CHAR;
                        sequence = ddSequence;

                        int startPos;
                        // skip whitespace in front of the element text
                        while (true) {
                            ch = fSource[fScannerPosition++];
                            if (!Character.isWhitespace(ch)) {
                                // element text starts at this character
                                startPos = fScannerPosition - 1;
                                listElement = new WPListElement(count, sequence, startPos);
                                break;
                            }
                            if (ch == '\n') {
                                // line ended without text: add an empty element
                                fScannerPosition--; // to detect next row
                                startPos = fScannerPosition;
                                listElement = new WPListElement(count, sequence, startPos);
                                listElement.createTagStack(fSource, fWikiModel, startPos);
                                list.add(listElement);
                                listElement = null;
                                break;
                            }
                        }
                        lastCh = ' ';
                    }
                }
                if (ch == '\n' || fScannerPosition == 0) {
                    // beginning of a line: remember it so that the scanner can
                    // be rewound if the line is not a list line
                    startPosition = fScannerPosition;
                    if (listElement != null) {
                        // the newline ends the open element
                        listElement.createTagStack(fSource, fWikiModel, fScannerPosition - 1);
                        list.add(listElement);
                        listElement = null;
                    }
                    ch = fSource[fScannerPosition++];
                    switch (ch) {
                    case WPList.DL_DD_CHAR:
                    case WPList.DL_DT_CHAR:
                    case WPList.OL_CHAR:
                    case WPList.UL_CHAR:
                        // count the consecutive marker characters of this line
                        count = 1;
                        lastCh = ch;
                        while (fSource[fScannerPosition] == WPList.UL_CHAR || fSource[fScannerPosition] == WPList.OL_CHAR
                                || fSource[fScannerPosition] == WPList.DL_DD_CHAR || fSource[fScannerPosition] == WPList.DL_DT_CHAR) {
                            count++;
                            lastCh = fSource[fScannerPosition++];
                        }

                        // copy the marker characters just read
                        sequence = new char[count];
                        System.arraycopy(fSource, fScannerPosition - count, sequence, 0, count);

                        int startPos;
                        // skip whitespace in front of the element text
                        while (true) {
                            ch = fSource[fScannerPosition++];
                            if (!Character.isWhitespace(ch)) {
                                // element text starts at this character
                                startPos = fScannerPosition - 1;
                                listElement = new WPListElement(count, sequence, startPos);
                                break;
                            }
                            if (ch == '\n') {
                                // line ended without text: add an empty element
                                fScannerPosition--; // to detect next row
                                startPos = fScannerPosition;
                                listElement = new WPListElement(count, sequence, startPos);
                                listElement.createTagStack(fSource, fWikiModel, startPos);
                                list.add(listElement);
                                listElement = null;
                                break;
                            }
                        }

                        break;

                    default:
                        // not a list line: rewind to the start of the line and
                        // return what has been collected
                        fScannerPosition = startPosition;
                        return list;
                    }
                }

                // jump behind special wiki tags and bracketed spans so that
                // their content is not examined by this loop
                if (ch == '<') {
                    int temp = readSpecialWikiTags(fScannerPosition);
                    if (temp >= 0) {
                        fScannerPosition = temp;
                    }
                } else if (ch == '[') {
                    int temp = findNestedEndSingle(fSource, '[', ']', fScannerPosition);
                    if (temp >= 0) {
                        fScannerPosition = temp;
                    }
                }
                ch = fSource[fScannerPosition++];
            }
        } catch (IndexOutOfBoundsException e) {
            // end of input; with this value the "fScannerPosition - 1" below
            // equals fSource.length
            fScannerPosition = fSource.length + 1;
        }
        if (list != null) {
            if (listElement != null) {
                // close the element that was still open at the end of input
                listElement.createTagStack(fSource, fWikiModel, fScannerPosition - 1);
                list.add(listElement);
                listElement = null;
            }
            return list;
        }
        return null;
    }

    /**
     * Advances the scanner to the next newline character and leaves it
     * positioned on that newline.
     *
     * @return the offset of the newline (equal to the new scanner position)
     * @throws IndexOutOfBoundsException
     *           if the source contains no further newline
     */
    public int nextNewline() {
        char current;
        do {
            current = fSource[fScannerPosition++];
        } while (current != '\n');
        // step back so that the scanner rests on the newline itself
        fScannerPosition--;
        return fScannerPosition;
    }

    /**
     * Advances the scanner to the next newline, like {@link #nextNewline()},
     * while taking cell markup on the way into account: every '|' is reported
     * to {@code cell} as attribute boundary, and '[' ... ']' as well as
     * '{' ... '}' spans are skipped as a whole when their closing character
     * exists.
     *
     * @param cell
     *          the cell that receives the attribute positions; may be null
     * @return the offset of the newline (equal to the new scanner position)
     * @throws IndexOutOfBoundsException
     *           if the source ends before a newline is found
     */
    public int nextNewlineCell(WPCell cell) {
        for (;;) {
            final char current = fSource[fScannerPosition++];
            switch (current) {
            case '\n':
                // rest on the newline itself
                fScannerPosition--;
                return fScannerPosition;
            case '|':
                if (cell != null) {
                    // index of the '|' just read
                    cell.setAttributesStartPos(fScannerPosition - 1);
                }
                break;
            case '[': {
                final int behindBracket = findNestedEndSingle(fSource, '[', ']', fScannerPosition);
                if (behindBracket >= 0) {
                    fScannerPosition = behindBracket;
                }
                break;
            }
            case '{': {
                final int behindBrace = findNestedEndSingle(fSource, '{', '}', fScannerPosition);
                if (behindBrace >= 0) {
                    fScannerPosition = behindBrace;
                }
                break;
            }
            default:
                break;
            }
        }
    }

    /**
     * Searches forward from the scanner position for the end of an HTML
     * comment (the three characters "-->"). The scanner position is advanced
     * while searching.
     *
     * @return the offset directly behind the "-->", or -1 if the source
     *         contains no further "-->"
     */
    public int indexEndOfComment() {
        // the last offset at which a three character sequence can start
        final int limit = fSource.length - 2;
        while (fScannerPosition < limit) {
            final int candidate = fScannerPosition++;
            if (fSource[candidate] == '-' && fSource[candidate + 1] == '-' && fSource[candidate + 2] == '>') {
                return candidate + 3;
            }
        }
        return -1;
    }

    /**
     * Searches forward from the scanner position for the closing
     * "&lt;/nowiki&gt;" tag (lower case only). The scanner position is
     * advanced while searching.
     *
     * @return the offset directly behind the closing tag, or -1 if the source
     *         contains no further closing tag
     */
    public int indexEndOfNowiki() {
        final char[] closingTag = { '<', '/', 'n', 'o', 'w', 'i', 'k', 'i', '>' };
        // the last offset at which the complete tag still fits into the source
        final int lastStart = fSource.length - closingTag.length;
        while (fScannerPosition <= lastStart) {
            final int tagStart = fScannerPosition++;
            int matched = 0;
            while (matched < closingTag.length && fSource[tagStart + matched] == closingTag[matched]) {
                matched++;
            }
            if (matched == closingTag.length) {
                return tagStart + closingTag.length;
            }
        }
        return -1;
    }

    /**
     * Searches forward from the scanner position for the "|}" that closes the
     * wiki table the scanner is currently in. HTML comments and
     * "&lt;nowiki&gt;" sections are skipped as a whole, and tables nested at
     * the start of a line ("\n{|") are counted so that their own "|}" is not
     * mistaken for the end. A closing "|}" is only recognised at the start of
     * a line, optionally preceded by spaces or tabs.
     *
     * The scanner position is modified while searching. When a comment or
     * nowiki section is not terminated, the scanner position itself is left
     * at -1.
     *
     * @return the offset directly behind the matching "|}", or -1 if none was
     *         found
     */
    public int indexEndOfTable() {
        // the table we are in counts as one open table
        int openTables = 1;
        try {
            while (fScannerPosition < fSource.length) {
                final char current = fSource[fScannerPosition++];
                if (current == '<') {
                    if (fSource[fScannerPosition] == '!' && fSource[fScannerPosition + 1] == '-'
                            && fSource[fScannerPosition + 2] == '-') {
                        // start of an HTML comment
                        fScannerPosition += 3;
                        fScannerPosition = indexEndOfComment();
                        if (fScannerPosition == -1) {
                            return -1;
                        }
                    } else if (fSource[fScannerPosition] == 'n' && fSource[fScannerPosition + 1] == 'o'
                            && fSource[fScannerPosition + 2] == 'w' && fSource[fScannerPosition + 3] == 'i'
                            && fSource[fScannerPosition + 4] == 'k' && fSource[fScannerPosition + 5] == 'i'
                            && fSource[fScannerPosition + 6] == '>') {
                        // start of a <nowiki> section
                        fScannerPosition += 7;
                        fScannerPosition = indexEndOfNowiki();
                        if (fScannerPosition == -1) {
                            return -1;
                        }
                    }
                } else if (current == '\n') {
                    if (fSource[fScannerPosition] == '{' && fSource[fScannerPosition + 1] == '|') {
                        // assume a nested table
                        openTables++;
                    } else {
                        final int lineStart = fScannerPosition;
                        char lineChar = fSource[fScannerPosition++];
                        // ignore spaces and tabs at the beginning of the line
                        while (lineChar == ' ' || lineChar == '\t') {
                            lineChar = fSource[fScannerPosition++];
                        }
                        if (lineChar == '|' && fSource[fScannerPosition] == '}') {
                            openTables--;
                            if (openTables == 0) {
                                return fScannerPosition + 1;
                            }
                        }
                        // continue the regular scan at the start of this line
                        fScannerPosition = lineStart;
                    }
                }
            }
        } catch (IndexOutOfBoundsException ignored) {
            // a look-ahead ran past the end of the source: no closing tag
        }
        return -1;
    }

    /**
     * Tests whether the part of {@code str} that begins at {@code toffset}
     * starts with {@code prefix}, optionally ignoring case.
     *
     * @see java.lang.String#startsWith(String)
     * @param str
     *          the String to check, may be null
     * @param toffset
     *          the offset in {@code str} at which the comparison starts
     * @param prefix
     *          the prefix to look for, may be null
     * @param ignoreCase
     *          true for a case insensitive comparison
     * @return true if the prefix matches at the offset or if both Strings are
     *         null; false if exactly one of them is null or the prefix does
     *         not fit or match
     */
    public static boolean startsWith(String str, int toffset, String prefix, boolean ignoreCase) {
        if (str == null) {
            return prefix == null;
        }
        if (prefix == null) {
            return false;
        }
        final int prefixLength = prefix.length();
        final int remaining = str.length() - toffset;
        if (remaining < prefixLength) {
            return false;
        }
        return str.regionMatches(ignoreCase, toffset, prefix, 0, prefixLength);
    }



    /**
     * Split the given source string by the pipe symbol (i.e. "|"), without a
     * limit on the number of parts. Delegates to
     * {@code splitByChar('|', sourceString, resultList, -1)}.
     *
     * @param sourceString
     *          the string to split
     * @param resultList
     *          the list the parts are appended to
     * @return the list returned by {@code splitByChar}
     */
    public static List splitByPipe(String sourceString, List resultList) {
        return splitByChar('|', sourceString, resultList, -1);
    }

    /**
     * Split a region of the given character array by the pipe symbol (i.e.
     * "|"), without a limit on the number of parts. Delegates to
     * {@code splitByChar('|', srcArray, currOffset, endOffset, resultList, -1)}.
     *
     * @param srcArray
     *          the array to split
     * @param currOffset
     *          start position in srcArray
     * @param endOffset
     *          end position in srcArray
     * @param resultList
     *          the list the parts are appended to
     *
     * @return the list returned by {@code splitByChar}
     */
    public static List splitByPipe(char[] srcArray, int currOffset, int endOffset, List resultList) {
        return splitByChar('|', srcArray, currOffset, endOffset, resultList, -1);
    }

    /**
     * Split the given source string by the given character. The string is
     * converted to a char array and the whole array is handed to the
     * array based {@code splitByChar} overload.
     *
     * @param splitChar
     *          the character to split by
     * @param sourceString
     *          the string to split
     * @param resultList
     *          the list the parts are appended to
     * @param maxParts
     *          max number of parts to split the source into (less than 0
     *          for infinite number of parts, otherwise only values greater than
     *          0 allowed!)
     * @return the list returned by the array based overload
     */
    public static List splitByChar(final char splitChar, String sourceString, List resultList, final int maxParts) {
        // TODO optimize this to avoid new char[] generation inside toCharArray() ?
        return splitByChar(splitChar, sourceString.toCharArray(), 0, sourceString.length(), resultList, maxParts);
    }

    /**
     * Split a region of the given character array by the given character.
     * Split characters inside "[[...]]", "{{...}}" and "{{{...}}}" blocks do
     * not split.
     *
     * Only the region from {@code currOffset} (inclusive) to
     * {@code endOffset} (exclusive) is split. {@code endOffset} is capped at
     * the array length, and a nested block that ends behind the region ends
     * the last part at the end of the region. The look-ahead for doubled
     * brackets and braces is bounds-checked, so an unfinished "[[", "{{" or
     * "{{{" at the end of the region stays part of the last part.
     *
     * @param splitChar
     *          the character to split by
     * @param srcArray
     *          the array to split
     * @param currOffset
     *          start position in srcArray
     * @param endOffset
     *          end position in srcArray
     * @param resultList
     *          the list the parts are appended to; a new list is created if
     *          this is null
     * @param maxParts
     *          max number of parts to split the source into (less than 0
     *          for infinite number of parts, otherwise only values greater than
     *          0 allowed!). The limit is compared with the size of
     *          resultList, so entries that are already in the list count as
     *          parts.
     *
     * @return resultList (or the newly created list) with the parts appended;
     *         at least one part, possibly the empty string, is always added
     */
    protected static List splitByChar(final char splitChar, char[] srcArray, int currOffset, int endOffset,
            List resultList, final int maxParts) {
        assert (maxParts != 0 && maxParts != 1); // this doesn't make any sense!
        if (resultList == null) {
            resultList = new ArrayList<>();
        }
        // never scan or copy behind the array, even for an oversized endOffset
        final int limit = Math.min(endOffset, srcArray.length);

        int lastOffset = currOffset;
        while (currOffset < limit) {
            final char ch = srcArray[currOffset++];
            final boolean doubled = currOffset < limit && srcArray[currOffset] == ch;
            if (ch == '[' && doubled) {
                // "[[": skip the link if it is closed
                currOffset++;
                final int linkEnd = findNestedEnd(srcArray, '[', ']', currOffset);
                if (linkEnd >= 0) {
                    currOffset = linkEnd;
                }
            } else if (ch == '{' && doubled) {
                currOffset++;
                final boolean thirdBrace = currOffset < limit && srcArray[currOffset] == '{';
                final boolean fourthBrace = thirdBrace && currOffset + 1 < limit && srcArray[currOffset + 1] == '{';
                if (thirdBrace && !fourthBrace) {
                    // exactly "{{{": template parameter
                    currOffset++;
                    final int parameterEnd = findNestedParamEnd(srcArray, currOffset)[0];
                    if (parameterEnd >= 0) {
                        currOffset = parameterEnd;
                    }
                } else {
                    // "{{": template
                    final int templateEnd = findNestedTemplateEnd(srcArray, currOffset);
                    if (templateEnd >= 0) {
                        currOffset = templateEnd;
                    }
                }
            } else if (ch == splitChar) {
                if (maxParts > 0 && resultList.size() >= maxParts - 1) {
                    // take rest and put it into the last part
                    currOffset = limit;
                    break;
                }
                resultList.add(new String(srcArray, lastOffset, currOffset - lastOffset - 1));
                lastOffset = currOffset;
            }
        }

        // A skipped nested block may end behind the region: cut the last part
        // at the region end. An empty remainder still yields an empty part.
        final int partEnd = Math.max(lastOffset, Math.min(currOffset, limit));
        if (partEnd > lastOffset) {
            resultList.add(new String(srcArray, lastOffset, partEnd - lastOffset));
        } else {
            resultList.add("");
        }
        return resultList;
    }

    /**
     * Finds the end of a nested block that is delimited by doubled
     * characters, i.e. something like [[...[[  ]]...]]. The search starts
     * behind an opening pair that has already been consumed.
     *
     * @param sourceArray
     *          the characters to search
     * @param startCh
     *          the character whose doubled form opens a nested block
     * @param endChar
     *          the character whose doubled form closes a block
     * @param startPosition
     *          the offset directly behind the opening pair
     * @return the offset directly behind the closing pair that matches the
     *         already consumed opening pair, or -1 if there is none
     */
    public static int findNestedEnd(final char[] sourceArray, final char startCh, final char endChar, int startPosition) {
        // a pair needs two characters, so the last index cannot start one
        final int lastIndex = sourceArray.length - 1;
        int depth = 1;
        int cursor = startPosition;
        try {
            while (cursor < lastIndex) {
                final char current = sourceArray[cursor++];
                final char following = sourceArray[cursor];
                if (current != following) {
                    continue;
                }
                if (current == startCh) {
                    cursor++;
                    depth++;
                } else if (current == endChar) {
                    cursor++;
                    depth--;
                    if (depth == 0) {
                        return cursor;
                    }
                }
            }
        } catch (IndexOutOfBoundsException e) {
            // e.g. a negative start position
            return -1;
        }
        return -1;
    }

    /**
     * Finds the end of a nested block that is delimited by single characters,
     * i.e. something like [...[  ]...]. The search starts behind an opening
     * character that has already been consumed.
     *
     * @param sourceArray
     *          the characters to search
     * @param startCh
     *          the character that opens a nested block
     * @param endChar
     *          the character that closes a block
     * @param startPosition
     *          the offset directly behind the opening character
     * @return the offset directly behind the closing character that matches
     *         the already consumed opening character, or -1 if there is none
     */
    public static int findNestedEndSingle(final char[] sourceArray, final char startCh, final char endChar, int startPosition) {
        int depth = 1;
        try {
            for (int cursor = startPosition; cursor < sourceArray.length; cursor++) {
                final char current = sourceArray[cursor];
                if (current == startCh) {
                    depth++;
                } else if (current == endChar && --depth == 0) {
                    return cursor + 1;
                }
            }
        } catch (IndexOutOfBoundsException e) {
            // e.g. a negative start position
            return -1;
        }
        return -1;
    }

    /**
     * Finds the end of a template whose opening "{{" has already been
     * consumed.
     *
     * A brace only counts when it is adjacent to a brace of the same kind, so
     * single braces inside the template are ignored. Every counted '{'
     * increases and every counted '}' decreases the nesting counter; the
     * first counted '}' seen while the counter is zero ends the template.
     *
     * The neighbour checks are bounds-checked, so a brace at offset 0 is
     * handled, and the returned offset never exceeds the array length: when
     * the ending '}' is the last one of a run, the end lies directly behind
     * it.
     *
     * TODO: This logic needs to be improved, there are very likely cases where this does not work as
     * intended. Template parameters can take {} as well, e.g. {{foo:bar|{|}|{}|baz}}.
     *
     * @param sourceArray
     *          the characters to search
     * @param startPosition
     *          the offset directly behind the opening "{{"
     * @return the template end position (the offset directly behind the
     *         closing braces) or -1 if there is no end.
     */
    public static int findNestedTemplateEnd(final char[] sourceArray, int startPosition) {
        if (startPosition < 0) {
            return -1;
        }
        final int length = sourceArray.length;
        int countOpenBraces = 0;
        for (int index = startPosition; index < length; index++) {
            final char ch = sourceArray[index];
            if (ch != '{' && ch != '}') {
                continue;
            }
            final boolean samePrevious = index > 0 && sourceArray[index - 1] == ch;
            final boolean sameNext = index + 1 < length && sourceArray[index + 1] == ch;
            if (!samePrevious && !sameNext) {
                // a single brace is not template syntax
                continue;
            }
            if (ch == '{') {
                countOpenBraces++;
            } else if (countOpenBraces > 0) {
                countOpenBraces--;
            } else {
                // first brace of a closing pair: the end is behind the pair;
                // last brace of a run: the end is directly behind this brace
                return sameNext ? index + 2 : index + 1;
            }
        }
        return -1;
    }

    /**
     * Find the end of a template parameter declaration or the end of a template
     * declaration. The search starts behind an opening "{{{" that has already
     * been consumed. Nested "{{{...}}}" parameters and "{{...}}" templates are
     * skipped, and single '{' ... '}' pairs are balanced.
     *
     * All look-aheads are bounds-checked, so a closing "}}" at the very end
     * of the array is reported as the end of a template declaration.
     *
     * @param sourceArray
     *          the characters to search
     * @param startPosition
     *          the offset directly behind the opening "{{{"
     * @return an array of two integers. If array[0] > 0 the scanner
     *         has found the end position of a template parameter declaration
     *         (the offset behind "}}}"). If array[1] > 0 the scanner has found
     *         the end position of a template declaration (the offset behind
     *         "}}"). If nothing was found both values are -1.
     */
    public static int[] findNestedParamEnd(final char[] sourceArray, int startPosition) {
        if (startPosition < 0) {
            return new int[] { -1, -1 };
        }
        final int sourceArrayLength = sourceArray.length;
        int countSingleOpenBraces = 0;
        int parameterPosition = startPosition;
        while (parameterPosition < sourceArrayLength) {
            final char ch = sourceArray[parameterPosition++];
            if (ch == '{') {
                if (parameterPosition < sourceArrayLength && sourceArray[parameterPosition] == '{') {
                    parameterPosition++;
                    final boolean thirdBrace = parameterPosition < sourceArrayLength
                            && sourceArray[parameterPosition] == '{';
                    final boolean fourthBrace = thirdBrace && parameterPosition + 1 < sourceArrayLength
                            && sourceArray[parameterPosition + 1] == '{';
                    if (thirdBrace && !fourthBrace) {
                        // template parameter beginning
                        parameterPosition++;
                        final int[] nested = findNestedParamEnd(sourceArray, parameterPosition);
                        if (nested[0] >= 0) {
                            parameterPosition = nested[0];
                        } else if (nested[1] >= 0) {
                            parameterPosition = nested[1];
                        } else {
                            return new int[] { -1, -1 };
                        }
                    } else {
                        // template beginning
                        final int templateEnd = findNestedTemplateEnd(sourceArray, parameterPosition);
                        if (templateEnd < 0) {
                            return new int[] { -1, -1 };
                        }
                        parameterPosition = templateEnd;
                    }
                } else {
                    countSingleOpenBraces++;
                }
            } else if (ch == '}') {
                if (countSingleOpenBraces > 0) {
                    countSingleOpenBraces--;
                } else if (parameterPosition < sourceArrayLength && sourceArray[parameterPosition] == '}') {
                    if (parameterPosition + 1 < sourceArrayLength && sourceArray[parameterPosition + 1] == '}') {
                        // template parameter ending
                        return new int[] { parameterPosition + 2, -1 };
                    }
                    // only "}}": template ending
                    return new int[] { -1, parameterPosition + 1 };
                }
            }
        }
        return new int[] { -1, -1 };
    }

    /**
     * Parse a tag. Parse the name and attributes from a start tag.
     * 
     * From the 
     * HTML 4.01 Specification, W3C Recommendation 24 December 1999
     * http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2
     * 

     *  3.2.2 Attributes
     * 

     * Elements may have associated properties, called attributes, which may have
     * values (by default, or set by authors or scripts). Attribute/value pairs
     * appear before the final ">" of an element's start tag. Any number of
     * (legal) attribute value pairs, separated by spaces, may appear in an
     * element's start tag. They may appear in any order.
     * 

     * In this example, the id attribute is set for an H1 element:
     * <pre>
     * &lt;H1 id="section1"&gt;
     * This is an identified heading thanks to the id attribute
     * &lt;/H1&gt;
     * </pre>
     * By default, SGML
     * requires that all attribute values be delimited using either double
     * quotation marks (ASCII decimal 34) or single quotation marks (ASCII decimal
     * 39). Single quote marks can be included within the attribute value when the
     * value is delimited by double quote marks, and vice versa. Authors may also
     * use numeric character references to represent double quotes (&#34;) and
     * single quotes (&#39;). For doublequotes authors can also use the
     * character entity reference &quot;.
     * 

     * In certain cases, authors may specify the value of an attribute without any
     * quotation marks. The attribute value may only contain letters (a-z and
     * A-Z), digits (0-9), hyphens (ASCII decimal 45), periods (ASCII decimal 46),
     * underscores (ASCII decimal 95), and colons (ASCII decimal 58). We recommend
     * using quotation marks even when it is possible to eliminate them.
     * 

     * Attribute names are always case-insensitive.
     * 

     * Attribute values are generally case-insensitive. The definition of each
     * attribute in the reference manual indicates whether its value is
     * case-insensitive.
     * 

     * All the attributes defined by this specification are listed in the
     * attribute index.
     * 

     * 
     * 

     * This method uses a state machine with the following states:
     * 

     * state 0 - outside of any attribute
     * state 1 - within attribute name
     * state 2 - equals hit
     * state 3 - within naked attribute value
     * state 4 - within single quoted attribute value
     * state 5 - within double quoted attribute value
     * state 6 - whitespace after an attribute name; could lead to state 2 (=) or
     * state 0
     * 
     * 
     * The starting point for the various components is stored in an array of
     * integers that match the initiation point for the states one-for-one, i.e.
     * bookmarks[0] is where state 0 began, bookmarks[1] is where state 1 began,
     * etc. Attributes are stored in an ArrayList having one slot for
     * each standalone attribute or attribute/value pair; whitespace runs are
     * recognized but not stored. The first slot is for attribute
     * name (kind of like a standalone attribute).
     *
     * @param start
     *          The position at which to start scanning.
     * @return The parsed tag, or <code>null</code> if no tag could be built,
     *         for example because the input ends before a closing '&gt;' is
     *         found.
     */
    protected WikiTagNode parseTag(int start) {
        // Runs the attribute state machine (states 0-6) over fSource beginning at
        // start. Returns the node built by makeTag(), or null when the input
        // ends before the tag is closed or the last consumed character is not
        // '>'. In those two cases the scanner position is reset to start.
        //
        // Note: ch is a char and EOF is the int -1, so the "EOF == ch" tests
        // below can never be true. Running off the end of fSource is signalled
        // by the IndexOutOfBoundsException handled at the bottom instead.
        boolean done;
        char ch;
        int state;
        int[] bookmarks;

        done = false;
        ArrayList attributes = new ArrayList<>();
        state = 0;
        fScannerPosition = start;
        // 8 slots: the loop writes bookmarks[state + 1] for states 0..6
        bookmarks = new int[8];
        bookmarks[0] = fScannerPosition;
        try {
            while (!done) {
                // Position of the character that is read next. The slot is
                // overwritten on every pass, so after a state change it holds
                // the position of the character that caused the change.
                bookmarks[state + 1] = fScannerPosition;
                ch = fSource[fScannerPosition++];
                switch (state) {
                case 0: // outside of any attribute
                    if ((EOF == ch) || ('>' == ch) || ('<' == ch)) {
                        if ('<' == ch) {
                            // don't consume the opening angle
                            bookmarks[state + 1] = --fScannerPosition;
                        }
                        whitespace(attributes, bookmarks);
                        done = true;
                    } else if (!Character.isWhitespace(ch)) {
                        // first character of an attribute name
                        whitespace(attributes, bookmarks);
                        state = 1;
                    }
                    break;
                case 1: // within attribute name
                    if ((EOF == ch) || ('>' == ch) || ('<' == ch)) {
                        if ('<' == ch) {
                            // don't consume the opening angle
                            bookmarks[state + 1] = --fScannerPosition;
                        }
                        standalone(attributes, bookmarks);
                        done = true;
                    } else if (Character.isWhitespace(ch)) {
                        // whitespaces might be followed by next attribute or an
                        // equal sign
                        // see Bug #891058 Bug in lexer.
                        bookmarks[6] = bookmarks[2]; // setting the
                        // bookmark[0]
                        // is done in state 6 if
                        // applicable
                        state = 6;
                    } else if ('=' == ch)
                        state = 2;
                    break;
                case 2: // equals hit
                    if ((EOF == ch) || ('>' == ch)) {
                        empty(attributes, bookmarks);
                        done = true;
                    } else if ('\'' == ch) {
                        state = 4;
                        // position of the opening single quote
                        bookmarks[4] = bookmarks[3];
                    } else if ('"' == ch) {
                        state = 5;
                        // position of the opening double quote
                        bookmarks[5] = bookmarks[3];
                    } else if (Character.isWhitespace(ch)) {
                        // collect white spaces after "=" into the assignment
                        // string;
                        // do nothing
                        // see Bug #891058 Bug in lexer.
                    } else
                        state = 3;
                    break;
                case 3: // within naked attribute value
                    if ((EOF == ch) || ('>' == ch)) {
                        naked(attributes, bookmarks);
                        done = true;
                    } else if (Character.isWhitespace(ch)) {
                        naked(attributes, bookmarks);
                        bookmarks[0] = bookmarks[4];
                        state = 0;
                    } else if (ch == '/' && fSource[fScannerPosition] == '>') {
                        // "/>" directly behind an unquoted value: close the
                        // value and step back so that the '/' is read again
                        // in state 0
                        naked(attributes, bookmarks);
                        bookmarks[0] = bookmarks[4];
                        fScannerPosition--;
                        state = 0;
                    }
                    break;
                case 4: // within single quoted attribute value
                    if (EOF == ch) {
                        single_quote(attributes, bookmarks);
                        done = true; // complain?
                    } else if ('\'' == ch) {
                        single_quote(attributes, bookmarks);
                        // continue behind the closing quote
                        bookmarks[0] = bookmarks[5] + 1;
                        state = 0;
                    }
                    break;
                case 5: // within double quoted attribute value
                    if (EOF == ch) {
                        double_quote(attributes, bookmarks);
                        done = true; // complain?
                    } else if ('"' == ch) {
                        double_quote(attributes, bookmarks);
                        // continue behind the closing quote
                        bookmarks[0] = bookmarks[6] + 1;
                        state = 0;
                    }
                    break;
                case 6: // undecided for state 0 or 2
                    // we have read white spaces after an attribute name
                    if (EOF == ch) {
                        // same as last else clause
                        standalone(attributes, bookmarks);
                        bookmarks[0] = bookmarks[6];
                        // mPage.ungetCharacter(mCursor);
                        --fScannerPosition;
                        state = 0;
                    } else if (Character.isWhitespace(ch)) {
                        // proceed
                    } else if ('=' == ch) {// yepp. the white spaces belonged to the
                        // equal.
                        bookmarks[2] = bookmarks[6];
                        bookmarks[3] = bookmarks[7];
                        state = 2;
                    } else {
                        // white spaces were not ended by equal
                        // meaning the attribute was a stand alone attribute
                        // now: create the stand alone attribute and rewind
                        // the cursor to the end of the white spaces
                        // and restart scanning as whitespace attribute.
                        standalone(attributes, bookmarks);
                        bookmarks[0] = bookmarks[6];
                        --fScannerPosition;
                        state = 0;
                    }
                    break;
                default:
                    throw new IllegalStateException("how did we get in state " + state);
                }
            }
            // only a scan that stopped on '>' counts as a tag; a scan that
            // stopped in front of '<' is rejected here
            if (fSource[fScannerPosition - 1] != '>') {
                fScannerPosition = start;
                return null;
            }
            return (makeTag(start, fScannerPosition, attributes));
        } catch (IndexOutOfBoundsException e) {
            // the input ended before the tag was closed

            if (state == 3) {
                // within naked attribute value
                // NOTE(review): the attribute list is discarded by the
                // "return null" below, so this call looks like it has no
                // visible effect - confirm.
                naked(attributes, bookmarks);
            }
        }
        fScannerPosition = start;
        return null;
    }

    protected List parseAttributes(int start, int end) {
        // Runs the same attribute state machine as parseTag() over the range
        // [start, end) of fSource and returns the collected attributes. Unlike
        // parseTag() the loop also stops at end, there is no special handling
        // of "/>", and a value still open at the end of the range is flushed
        // after the loop. Returns null if an IndexOutOfBoundsException occurs.
        // The scanner position is moved and not restored.
        //
        // Note: ch is a char and EOF is the int -1, so the "EOF == ch" tests
        // below can never be true.
        boolean done;
        char ch;
        int state;
        int[] bookmarks;

        done = false;
        ArrayList attributes = new ArrayList<>();
        state = 0;
        fScannerPosition = start;
        // 8 slots: the loop writes bookmarks[state + 1] for states 0..6
        bookmarks = new int[8];
        bookmarks[0] = fScannerPosition;
        try {
            while (!done && fScannerPosition < end) {
                // Position of the character that is read next. The slot is
                // overwritten on every pass, so after a state change it holds
                // the position of the character that caused the change.
                bookmarks[state + 1] = fScannerPosition;
                ch = fSource[fScannerPosition++];
                switch (state) {
                case 0: // outside of any attribute
                    if ((EOF == ch) || ('>' == ch) || ('<' == ch)) {
                        if ('<' == ch) {
                            // don't consume the opening angle
                            bookmarks[state + 1] = --fScannerPosition;
                        }
                        whitespace(attributes, bookmarks);
                        done = true;
                    } else if (!Character.isWhitespace(ch)) {
                        // first character of an attribute name
                        whitespace(attributes, bookmarks);
                        state = 1;
                    }
                    break;
                case 1: // within attribute name
                    if ((EOF == ch) || ('>' == ch) || ('<' == ch)) {
                        if ('<' == ch) {
                            // don't consume the opening angle
                            bookmarks[state + 1] = --fScannerPosition;
                        }
                        standalone(attributes, bookmarks);
                        done = true;
                    } else if (Character.isWhitespace(ch)) {
                        // whitespaces might be followed by next attribute or an
                        // equal sign
                        // see Bug #891058 Bug in lexer.
                        bookmarks[6] = bookmarks[2]; // setting the
                        // bookmark[0]
                        // is done in state 6 if
                        // applicable
                        state = 6;
                    } else if ('=' == ch)
                        state = 2;
                    break;
                case 2: // equals hit
                    if ((EOF == ch) || ('>' == ch)) {
                        empty(attributes, bookmarks);
                        done = true;
                    } else if ('\'' == ch) {
                        state = 4;
                        // position of the opening single quote
                        bookmarks[4] = bookmarks[3];
                    } else if ('"' == ch) {
                        state = 5;
                        // position of the opening double quote
                        bookmarks[5] = bookmarks[3];
                    } else if (Character.isWhitespace(ch)) {
                        // collect white spaces after "=" into the assignment
                        // string;
                        // do nothing
                        // see Bug #891058 Bug in lexer.
                    } else
                        state = 3;
                    break;
                case 3: // within naked attribute value
                    if ((EOF == ch) || ('>' == ch)) {
                        naked(attributes, bookmarks);
                        done = true;
                    } else if (Character.isWhitespace(ch)) {
                        naked(attributes, bookmarks);
                        bookmarks[0] = bookmarks[4];
                        state = 0;
                    }
                    break;
                case 4: // within single quoted attribute value
                    if (EOF == ch) {
                        single_quote(attributes, bookmarks);
                        done = true; // complain?
                    } else if ('\'' == ch) {
                        single_quote(attributes, bookmarks);
                        // continue behind the closing quote
                        bookmarks[0] = bookmarks[5] + 1;
                        state = 0;
                    }
                    break;
                case 5: // within double quoted attribute value
                    if (EOF == ch) {
                        double_quote(attributes, bookmarks);
                        done = true; // complain?
                        // } else if ('\\' == ch && fSource[fScannerPosition] == '"') {
                        // fScannerPosition++;
                    } else if ('"' == ch) {
                        double_quote(attributes, bookmarks);
                        // continue behind the closing quote
                        bookmarks[0] = bookmarks[6] + 1;
                        state = 0;
                    }
                    break;
                // patch for lexer state correction by
                // Gernot Fricke
                // See Bug # 891058 Bug in lexer.
                case 6: // undecided for state 0 or 2
                    // we have read white spaces after an attribute name
                    if (EOF == ch) {
                        // same as last else clause
                        standalone(attributes, bookmarks);
                        bookmarks[0] = bookmarks[6];
                        // mPage.ungetCharacter(mCursor);
                        --fScannerPosition;
                        state = 0;
                    } else if (Character.isWhitespace(ch)) {
                        // proceed
                    } else if ('=' == ch) // yepp. the white spaces belonged
                    // to the equal.
                    {
                        bookmarks[2] = bookmarks[6];
                        bookmarks[3] = bookmarks[7];
                        state = 2;
                    } else {
                        // white spaces were not ended by equal
                        // meaning the attribute was a stand alone attribute
                        // now: create the stand alone attribute and rewind
                        // the cursor to the end of the white spaces
                        // and restart scanning as whitespace attribute.
                        standalone(attributes, bookmarks);
                        bookmarks[0] = bookmarks[6];
                        --fScannerPosition;
                        state = 0;
                    }
                    break;
                default:
                    throw new IllegalStateException("how did we get in state " + state);
                }
            }
            // the range ended inside an attribute value: flush what was read
            if (state == 3 || state == 4 || state == 5) {
                // within naked attribute value
                // NOTE(review): for states 4 and 5 the slot written here
                // (bookmarks[5] / bookmarks[6]) is not one that naked() reads;
                // naked() uses bookmarks[3] and bookmarks[4] as value bounds.
                // Confirm that an unterminated quoted value is meant to be
                // reported this way.
                bookmarks[state + 1] = fScannerPosition;
                naked(attributes, bookmarks);
            }
            return attributes;
        } catch (IndexOutOfBoundsException e) {
            // deliberately ignored here; the method falls through to
            // "return null" below

        }
        return null;
    }

    /**
     * Create a tag node based on the current cursor and the one provided.
     *
     * @param start
     *          The starting point of the node.
     * @param end
     *          The ending point of the node.
     * @param attributes
     *          The attributes parsed from the tag.
     * @return The new tag node, or <code>null</code> if the span between
     *         start and end is shorter than two characters.
     */
    protected WikiTagNode makeTag(int start, int end, ArrayList attributes) {
        // A span of fewer than two characters (empty, single character or
        // inverted) is treated as an error and yields no node.
        final int span = end - start;
        if (span < 2) {
            return null;
        }
        return new WikiTagNode(start, end, attributes);
    }

    /**
     * Generate a whitespace 'attribute'. The current implementation records nothing.
     *
     * @param attributes
     *          The list so far.
     * @param bookmarks
     *          The array of positions.
     */
    private void whitespace(ArrayList attributes, int[] bookmarks) {
        // No-op: nothing is added to the list for a whitespace run. Only the
        // disabled code below remains; it would have added an attribute
        // spanning bookmarks[0]..bookmarks[1].
        // if (bookmarks[1] > bookmarks[0])
        // attributes.addElement(new PageAttribute(fSource,-1, -1, bookmarks[0],
        // bookmarks[1], (char) 0));
    }

    /**
     * Generate a standalone attribute -- font.
     *
     * @param attributes
     *          The list so far.
     * @param bookmarks
     *          The array of positions.
     */
    private void standalone(ArrayList attributes, int[] bookmarks) {
        // name only: both value positions are passed as -1 and the quote as 0
        final int nameStart = bookmarks[1];
        final int nameEnd = bookmarks[2];
        final NodeAttribute nameOnly = new NodeAttribute(fSource, nameStart, nameEnd, -1, -1, (char) 0);
        attributes.add(nameOnly);
    }

    /**
     * Generate an empty attribute -- color=.
     *
     * @param attributes
     *          The list so far.
     * @param bookmarks
     *          The array of positions.
     */
    private void empty(ArrayList attributes, int[] bookmarks) {
        // name followed by '=' without a value: the value starts one position
        // behind bookmarks[2] and its end is passed as -1
        final int nameStart = bookmarks[1];
        final int nameEnd = bookmarks[2];
        final int valueStart = bookmarks[2] + 1;
        final NodeAttribute valueless = new NodeAttribute(fSource, nameStart, nameEnd, valueStart, -1, (char) 0);
        attributes.add(valueless);
    }

    /**
     * Generate an unquoted attribute -- size=1.
     *
     * @param attributes
     *          The list so far.
     * @param bookmarks
     *          The array of positions.
     */
    private void naked(ArrayList attributes, int[] bookmarks) {
        // unquoted value: bookmarks[3]..bookmarks[4], quote character 0
        final int nameStart = bookmarks[1];
        final int nameEnd = bookmarks[2];
        final int valueStart = bookmarks[3];
        final int valueEnd = bookmarks[4];
        final NodeAttribute unquoted = new NodeAttribute(fSource, nameStart, nameEnd, valueStart, valueEnd, (char) 0);
        attributes.add(unquoted);
    }

    /**
     * Generate a single quoted attribute -- width='100%'.
     *
     * @param attributes
     *          The list so far.
     * @param bookmarks
     *          The array of positions.
     */
    private void single_quote(ArrayList attributes, int[] bookmarks) {
        // bookmarks[4] is the opening quote, so the value starts one behind it
        final int nameStart = bookmarks[1];
        final int nameEnd = bookmarks[2];
        final int valueStart = bookmarks[4] + 1;
        final int valueEnd = bookmarks[5];
        final NodeAttribute quoted = new NodeAttribute(fSource, nameStart, nameEnd, valueStart, valueEnd, '\'');
        attributes.add(quoted);
    }

    /**
     * Generate a double quoted attribute -- CONTENT="Test Development".
     *
     * @param attributes
     *          The list so far.
     * @param bookmarks
     *          The array of positions.
     */
    private void double_quote(ArrayList attributes, int[] bookmarks) {
        // bookmarks[5] is the opening quote, so the value starts one behind it
        final int nameStart = bookmarks[1];
        final int nameEnd = bookmarks[2];
        final int valueStart = bookmarks[5] + 1;
        final int valueEnd = bookmarks[6];
        final NodeAttribute quoted = new NodeAttribute(fSource, nameStart, nameEnd, valueStart, valueEnd, '"');
        attributes.add(quoted);
    }

    /**
     * Parse the start tag beginning at <code>start</code> and search for the
     * matching closing tag <code>&lt;/tagName&gt;</code>, ignoring the case of
     * the tag name.
     *
     * @param start
     *          the position at which the tag is parsed; nothing is parsed if
     *          the character there is '/'
     * @return the value returned by
     *         {@link #readUntilIgnoreCase(int, String, String)} for the closing
     *         tag, or -1 if no non-empty start tag could be parsed. In the
     *         latter case the scanner position is restored to the value it had
     *         on entry.
     */
    protected int readSpecialWikiTags(int start) {
        final int startPosition = fScannerPosition;
        try {
            if (fSource[start] != '/') {
                // starting tag
                WikiTagNode tagNode = parseTag(start);
                if (tagNode != null && !tagNode.isEmptyXmlTag()) {
                    String tagName = tagNode.getTagName();
                    // search for the closing tag "</tagName>" behind the parsed
                    // start tag; "</" is matched exactly, the rest ignoring case
                    return readUntilIgnoreCase(fScannerPosition, "</", tagName + ">");
                }
            }
        } catch (IndexOutOfBoundsException e) {
            // start lies outside of the source: treated as "no tag found"
        }
        fScannerPosition = startPosition;
        return -1;
    }

    /**
     * Read the characters until the concatenated start and end
     * substring is found. The end substring is matched ignoring case
     * considerations.
     *
     * @param startString
     *          the start string which should be searched in exact case mode
     * @param endString
     *          the end string which should be searched in ignore case mode
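     * @return the index reported by <code>Util.indexOfIgnoreCase</code>
     *         increased by the lengths of both strings (presumably the
     *         position directly behind the match), or -1 if nothing was found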
     */
    protected final int readUntilIgnoreCase(int start, String startString, String endString) {
        final int matchIndex = Util.indexOfIgnoreCase(fStringSource, startString, endString, start);
        if (matchIndex == -1) {
            // nothing found
            return -1;
        }
        // advance the reported index by the lengths of both search strings
        final int matchedLength = startString.length() + endString.length();
        return matchIndex + matchedLength;
    }

    /**
     * Read the characters until no more letters are found or the given
     * testChar is found. If testChar was found, return
     * the offset position.
     *
     * @param testChar
     *          the test character
     * @param fromIndex
     *          read from this offset
     * @return -1 if the character could not be found or no more
     *         letter character were found.
     */
    protected int indexOfUntilNoLetter(char testChar, int fromIndex) {
        for (int pos = fromIndex; pos < fSource.length; pos++) {
            final char current = fSource[pos];
            if (current == testChar) {
                // found: report the offset of the test character
                return pos;
            }
            if (!Character.isLetter(current)) {
                // a non-letter other than the test character ends the search
                return -1;
            }
        }
        // only letters up to the end of the source
        return -1;
    }
}