All Downloads are FREE. Search and download functionalities are using the official Maven repository.

info.bliki.wiki.filter.WikipediaParser Maven / Gradle / Ivy

The newest version!
package info.bliki.wiki.filter;

import info.bliki.commons.validator.routines.EmailValidator;
import info.bliki.htmlcleaner.ContentToken;
import info.bliki.htmlcleaner.EndTagToken;
import info.bliki.htmlcleaner.TagNode;
import info.bliki.htmlcleaner.TagToken;
import info.bliki.wiki.model.Configuration;
import info.bliki.wiki.model.DefaultEventListener;
import info.bliki.wiki.model.IEventListener;
import info.bliki.wiki.model.ITableOfContent;
import info.bliki.wiki.model.IWikiModel;
import info.bliki.wiki.tags.HTMLBlockTag;
import info.bliki.wiki.tags.HTMLTag;
import info.bliki.wiki.tags.HrTag;
import info.bliki.wiki.tags.PTag;
import info.bliki.wiki.tags.WPBoldItalicTag;
import info.bliki.wiki.tags.WPPreTag;
import info.bliki.wiki.tags.WPTag;
import info.bliki.wiki.tags.util.Attribute;
import info.bliki.wiki.tags.util.IBodyTag;
import info.bliki.wiki.tags.util.INoBodyParsingTag;
import info.bliki.wiki.tags.util.NodeAttribute;
import info.bliki.wiki.tags.util.TagStack;
import info.bliki.wiki.tags.util.WikiTagNode;

import java.util.List;

/**
 * A Wikipedia syntax parser for the second pass in the parsing of a Wikipedia
 * source text.
 *
 * @see TemplateParser for the first pass
 */
public class WikipediaParser extends AbstractWikipediaParser {
    /**
     * Names of the table-of-contents switches (written as
     * <code>__NAME__</code> in wiki text). The array order matters: the
     * special-identifier parsing in this class switches on the index
     * (0 = TOC, 1 = NOTOC, 2 = FORCETOC).
     */
    public static final String[] TOC_IDENTIFIERS = { "TOC", "NOTOC", "FORCETOC" };

    /**
     * Table of contents handed back by the wiki model when a header or a
     * <code>__TOC__</code> switch was parsed; stays <code>null</code>
     * until then.
     */
    private ITableOfContent fTableOfContentTag;
    /** Number of section headers parsed so far. */
    private int fHeadCounter;

    /**
     * If <code>true</code> a '&lt;' character is scanned as a possible HTML
     * tag. This class never sets it to another value.
     */
    private boolean fHtmlCodes = true;
    /** Set through {@link #setNoToC(boolean)}, e.g. by the NOTOC switch. */
    private boolean fNoToC;
    /** Value returned by <code>isTemplate()</code>; set in the constructor. */
    private boolean fRenderTemplate;
    /** Set by the TOC and FORCETOC switches. */
    private boolean fForceToC;

    /** Listener notified about parsed headers, wiki links and templates; never <code>null</code>. */
    private IEventListener fEventListener;

    /**
     * Create a parser without an explicit event listener; delegates to the
     * three-argument constructor with a <code>null</code> listener.
     *
     * @param stringSource
     *            the wiki text to parse
     * @param renderTemplate
     *            <code>true</code> if the parsed text is a template text
     */
    public WikipediaParser(String stringSource, boolean renderTemplate) {
        this(stringSource, renderTemplate, null);
    }

    /**
     * Create a parser for the given wiki text.
     *
     * @param stringSource
     *            the wiki text to parse
     * @param renderTemplate
     *            <code>true</code> if the parsed text is a template text
     * @param wikiListener
     *            listener for parser events; if <code>null</code> the
     *            {@link DefaultEventListener#CONST} listener is used
     */
    public WikipediaParser(String stringSource, boolean renderTemplate,
            IEventListener wikiListener) {
        super(stringSource);
        fEventListener = (wikiListener != null) ? wikiListener
                : DefaultEventListener.CONST;
        fRenderTemplate = renderTemplate;
    }

    /**
     * Look for a <code>#REDIRECT [[...]]</code> directive (matched case
     * insensitively) as the first non-whitespace content of the text.
     *
     * @param rawWikiText
     *            the wiki text
     * @param wikiModel
     *            the wiki model which is asked to append the redirect link
     * @return the redirect target if a redirect was found and the model
     *         accepted it (further parsing should then be cancelled);
     *         <code>null</code> otherwise
     */
    public static String parseRedirect(String rawWikiText, IWikiModel wikiModel) {
        final int textLength = rawWikiText.length();
        int hashPosition = 0;
        // only whitespace may precede the '#'
        while (hashPosition < textLength
                && Character.isWhitespace(rawWikiText.charAt(hashPosition))) {
            hashPosition++;
        }
        if (hashPosition >= textLength || rawWikiText.charAt(hashPosition) != '#') {
            return null;
        }
        if (!startsWith(rawWikiText, hashPosition + 1, "redirect", true)) {
            return null;
        }

        final int keywordEnd = hashPosition + 8;
        int targetStart = rawWikiText.indexOf("[[", keywordEnd);
        if (targetStart <= keywordEnd) {
            return null;
        }
        targetStart += 2;
        final int targetEnd = rawWikiText.indexOf("]]", targetStart);
        if (targetEnd < 0) {
            return null;
        }

        final String target = rawWikiText.substring(targetStart, targetEnd);
        return wikiModel.appendRedirectLink(target) ? target : null;
    }

    /**
     * Parse the read ahead content as the body of a wiki pre-formatted block.
     *
     * @param diff
     *            subtract diff from the current parser position to get the
     *            end position of the read ahead text
     * @return <code>true</code> if read ahead mode was active and the text
     *         (if any) was parsed without an <code>InvalidPreWikiTag</code>
     *         being thrown; <code>false</code> otherwise
     */
    private boolean createPreContentToken(final int diff) {
        if (!fWhiteStart) {
            return false;
        }
        final int endPosition = fCurrentPosition - diff;
        if (endPosition <= fWhiteStartPosition) {
            // nothing was read ahead
            return true;
        }
        try {
            final String preText = fStringSource.substring(fWhiteStartPosition,
                    endPosition);
            WikipediaPreTagParser.parseRecursive(preText, fWikiModel);
        } catch (InvalidPreWikiTag ignored) {
            return false;
        }
        fWhiteStart = false;
        return true;
    }

    private int getNextToken() // throws InvalidInputException
    {
        fWhiteStart = true;
        fWhiteStartPosition = fCurrentPosition;
        try {
            while (true) {
                fCurrentCharacter = fSource[fCurrentPosition++];

                // ---------Identify the next token-------------
                switch (fCurrentCharacter) {
                case '\n':
                    // check at the end of line, if there is open wiki bold or
                    // italic
                    // markup
                    reduceTokenStackBoldItalic();
                    break;
                case '{':
                    // dummy parsing of wikipedia templates for event listeners
                    if (!parseTemplate()) {
                        // wikipedia table handling
                        if (parseTable()) {
                            continue;
                        }
                    }
                    break;
                case '_': // TOC identifiers __NOTOC__, __FORCETOC__ ...
                    if (parseSpecialIdentifiers()) {
                        continue;
                    }
                    break;
                case '=': // wikipedia header ?
                    if (parseSectionHeaders()) {
                        continue;
                    }
                    break;
                case WPList.DL_DD_CHAR: // start of 
list case WPList.DL_DT_CHAR: // start of
list case WPList.OL_CHAR: // start of
    list case WPList.UL_CHAR: // start of
      list if (parseLists()) { continue; } break; // case ':': // if (parseSimpleDefinitionLists()) { // continue; // } // break; // case ';': // if (parseDefinitionLists()) { // continue; // } // break; case '-': // parse ---- as
      if (parseHorizontalRuler()) { continue; } break; case ' ': // pre-formatted text? case '\t': if (parsePreformattedWikiBlock()) { continue; } break; } if (isStartOfLine() && fWikiModel.getRecursionLevel() == 1) { if (isEmptyLine(1)) { if (fWikiModel.stackSize() > 0 && (fWikiModel.peekNode() instanceof PTag)) { // close

      tag: createContentToken(2); fWikiModel .reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN); } } else { if (fWikiModel.stackSize() == 0) { addParagraph(); // if (fWikiModel.getRecursionLevel() == 1) { // addParagraph(); // } else { // if (fCurrentPosition > 1) { // addParagraph(); // } // } } else { TagToken tag = fWikiModel.peekNode(); if (tag instanceof WPPreTag) { addPreformattedText(); // } else if (tag instanceof PTag) { // createContentToken(fWhiteStart, // fWhiteStartPosition, 2); // reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN); } else { String allowedParents = Configuration.HTML_PARAGRAPH_OPEN .getParents(); if (allowedParents != null) { int index; index = allowedParents.indexOf("|" + tag.getName() + "|"); if (index >= 0) { addParagraph(); } } } } } } // ---------Identify the next token------------- switch (fCurrentCharacter) { case '[': if (parseWikiLink()) { continue; } break; case '\'': if (getNextChar('\'')) { if (getNextChar('\'')) { if (getNextChar('\'')) { if (getNextChar('\'')) { createContentToken(5); return TokenBOLDITALIC; } fCurrentPosition -= 1; fWhiteStart = true; createContentToken(3); return TokenBOLD; } createContentToken(3); return TokenBOLD; } createContentToken(2); return TokenITALIC; } break; case '<': if (fHtmlCodes) { int htmlStartPosition = fCurrentPosition; // HTML tags are allowed try { switch (fStringSource.charAt(fCurrentPosition)) { case '!': // if (parseHTMLCommentTags()) { continue; } break; default: if (fSource[fCurrentPosition] != '/') { // opening HTML tag WikiTagNode tagNode = parseTag(fCurrentPosition); if (tagNode != null) { String tagName = tagNode.getTagName(); TagToken tag = fWikiModel.getTokenMap() .get(tagName); if (tag != null) { tag = (TagToken) tag.clone(); if (tag instanceof TagNode) { TagNode node = (TagNode) tag; List attributes = tagNode .getAttributesEx(); Attribute attr; String temp; for (int i = 1; i < attributes .size(); i++) { attr = attributes.get(i); temp = attr.getValue(); if (temp != 
null) { temp = parseNowiki(temp); } node.addAttribute( attr.getName(), temp, true); } } if (tag instanceof HTMLTag) { ((HTMLTag) tag) .setTemplate(isTemplate()); } createContentToken(1); fCurrentPosition = fScannerPosition; String allowedParents = tag .getParents(); if (allowedParents != null) { fWikiModel .reduceTokenStack(tag); } createTag(tag, tagNode, tagNode.getEndPosition()); return TokenIgnore; } else { fCurrentPosition = tagNode.getEndPosition(); // fWhiteStart = true; // skipUntilEndOfTag(tagNode, // tagNode.getEndPosition()); // createContentToken(0); // return TokenIgnore; } // break; } } else { // closing HTML tag WikiTagNode tagNode = parseTag(++fCurrentPosition); if (tagNode != null) { String tagName = tagNode.getTagName(); TagToken tag = fWikiModel.getTokenMap() .get(tagName); if (tag != null) { createContentToken(2); fCurrentPosition = fScannerPosition; if (fWikiModel.stackSize() > 0) { TagToken topToken = fWikiModel .peekNode(); if (topToken.getName().equals( tag.getName())) { fWikiModel.popNode(); return TokenIgnore; } else { if (tag.isReduceTokenStack()) { reduceStackUntilToken(tag); } } } return TokenIgnore; } break; } } } } catch (IndexOutOfBoundsException e) { // do nothing } fCurrentPosition = htmlStartPosition; } break; default: if (Character.isLetter(fCurrentCharacter)) { if (fCurrentPosition < 2 || !Character .isLetterOrDigit(fSource[fCurrentPosition - 2])) { if (fCurrentCharacter == 'i' || fCurrentCharacter == 'I') { // ISBN ? if (parseISBNLinks()) { fWhiteStart = true; fWhiteStartPosition = fCurrentPosition; continue; } } if (parseURIScheme()) { // a URI scheme registered in the wiki model // (ftp, http, // https,...) 
fWhiteStart = true; fWhiteStartPosition = fCurrentPosition; continue; } if (fWikiModel.isCamelCaseEnabled() && Character.isUpperCase(fCurrentCharacter) && fWikiModel.getRecursionLevel() <= 1) { if (parseCamelCaseLink()) { fWhiteStart = true; fWhiteStartPosition = fCurrentPosition; continue; } } } } } if (!fWhiteStart) { fWhiteStart = true; fWhiteStartPosition = fCurrentPosition - 1; } } // -----------------end switch while try-------------------- } catch (IndexOutOfBoundsException e) { // end of scanner text } try { createContentToken(1); } catch (IndexOutOfBoundsException e) { // end of scanner text } return TokenEOF; } private String parseNowiki(String input) { int indx = input.indexOf(""); if (indx >= 0) { int indx2; int lastIndx = 0; StringBuilder buf = new StringBuilder(input.length()); while (indx >= 0) { buf.append(input.substring(lastIndx, indx)); lastIndx = indx + 8; // length indx2 = input.indexOf("", indx + 1); if (indx2 >= 0) { buf.append(input.substring(lastIndx, indx2)); lastIndx = indx2 + 9;// length } else { break; } indx = input.indexOf("", indx2 + 1); } buf.append(input.substring(lastIndx, input.length())); return buf.toString(); } return input; } private void addParagraph() { createContentToken(2); fWikiModel.reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN); fWikiModel.pushNode(new PTag()); } /** * Add the content of the wiki <pre> block. Trim the content at the * right side. 
*/ private void addPreformattedText() { if (fWhiteStart) { int currentPos = fCurrentPosition; int whiteEndPosition = fCurrentPosition - 2; while (whiteEndPosition > fWhiteStartPosition) { if (!Character.isWhitespace(fSource[whiteEndPosition])) { whiteEndPosition++; break; } whiteEndPosition--; } try { fCurrentPosition = whiteEndPosition; createContentToken(0); } finally { fCurrentPosition = currentPos; } } fWikiModel.reduceTokenStack(Configuration.HTML_PARAGRAPH_OPEN); fWikiModel.pushNode(new PTag()); } private boolean parseISBNLinks() { final int urlStartPosition = fCurrentPosition; boolean foundISBN = false; try { if ((fCurrentCharacter == 'i' || fCurrentCharacter == 'I') && (fSource[fCurrentPosition] == 's' || fSource[fCurrentPosition] == 'S') && (fSource[++fCurrentPosition] == 'b' || fSource[fCurrentPosition] == 'B') && (fSource[++fCurrentPosition] == 'n' || fSource[fCurrentPosition] == 'N') && fSource[++fCurrentPosition] == ' ') { fCurrentPosition++; createContentToken(5); foundISBN = true; char ch; ch = fSource[fCurrentPosition++]; while ((ch >= '0' && ch <= '9') || ch == '-') { ch = fSource[fCurrentPosition++]; } } } catch (IndexOutOfBoundsException ignored) { } if (foundISBN) { String urlString = fStringSource.substring(urlStartPosition - 1, fCurrentPosition - 1); fCurrentPosition--; fWikiModel.appendISBNLink(urlString); return true; } // rollback work :-) fCurrentPosition = urlStartPosition; return false; } private boolean parseMailtoLinks() { final int urlStartPosition = fCurrentPosition; int tempPosition = fCurrentPosition; boolean foundUrl = false; try { if ((fCurrentCharacter == 'm' || fCurrentCharacter == 'M') && (fSource[fCurrentPosition] == 'a' || fSource[fCurrentPosition] == 'A') && (fSource[++fCurrentPosition] == 'i' || fSource[fCurrentPosition] == 'I') && (fSource[++fCurrentPosition] == 'l' || fSource[fCurrentPosition] == 'L') && (fSource[++fCurrentPosition] == 't' || fSource[fCurrentPosition] == 'T') && (fSource[++fCurrentPosition] == 'o' || 
fSource[fCurrentPosition] == 'O') && fSource[fCurrentPosition + 1] == ':') { tempPosition += 6; fCurrentCharacter = fSource[tempPosition++]; foundUrl = true; while (!Character.isWhitespace(fSource[tempPosition++])) { } } } catch (IndexOutOfBoundsException ignored) { } if (foundUrl) { String urlString = fStringSource.substring(urlStartPosition - 1, tempPosition - 1); String email = urlString.substring(7); if (EmailValidator.getInstance().isValid(email)) { createContentToken(5); fWhiteStart = false; fCurrentPosition = tempPosition; fCurrentPosition--; fWikiModel.appendMailtoLink(urlString, urlString, true); return true; } } // rollback work :-) fCurrentPosition = urlStartPosition; return false; } /** * See URI scheme * * @return true if a registered URI scheme was found in the * wiki models configuration.. */ private boolean parseURIScheme() { if (fCurrentCharacter == 'm' || fCurrentCharacter == 'M') { // mailto ? if (parseMailtoLinks()) { return true; } } int urlStartPosition = fCurrentPosition; int tempPosition = fCurrentPosition; String uriSchemeName = ""; int index = -1; boolean foundUrl = false; try { index = indexOfUntilNoLetter(':', fCurrentPosition); if (index > 0) { uriSchemeName = fStringSource.substring(fCurrentPosition - 1, index); if (fWikiModel.isValidUriScheme(uriSchemeName)) { // found something like "ftp", "http", "https" tempPosition += uriSchemeName.length() + 1; fCurrentCharacter = fSource[tempPosition++]; createContentToken(1); fWhiteStart = false; foundUrl = true; while (Encoder.isUrlIdentifierPart(fSource[tempPosition++])) { } } } } catch (IndexOutOfBoundsException ignored) { } if (foundUrl) { // separators at the end must be removed - maybe more chars? 
final String separators = ".!;?:,"; while (tempPosition > 1 && tempPosition > urlStartPosition && (separators.indexOf(fSource[tempPosition - 2]) != (-1))) { --tempPosition; } String restString = fStringSource.substring(urlStartPosition - 1, tempPosition - 1); String uriSchemeSpecificPart = fStringSource.substring(index + 1, tempPosition - 1); if (fWikiModel.isValidUriSchemeSpecificPart(uriSchemeName, uriSchemeSpecificPart)) { fWhiteStart = false; fCurrentPosition = tempPosition; fCurrentPosition--; fWikiModel.appendExternalLink(uriSchemeName, restString, restString, true); return true; } } // rollback work :-) fCurrentPosition = urlStartPosition; return false; } private boolean parseCamelCaseLink() { int startLinkPosition = fCurrentPosition - 1; int temp = fCurrentPosition; boolean isCamelCase = false; try { char ch = fSource[temp++]; while (Character.isLetterOrDigit(ch)) { if (Character.isUpperCase(ch)) { // at least 2 upper case characters appear in the word isCamelCase = true; } ch = fSource[temp++]; } } catch (IndexOutOfBoundsException ignored) { } if (isCamelCase) { createContentToken(1); fWhiteStart = false; fCurrentPosition = temp - 1; String name = fStringSource.substring(startLinkPosition, fCurrentPosition); fWikiModel.appendInternalLink(name, null, name, null, false); return true; } return false; } /** * Parse a wiki section starting with a '[' character * * @return true if a correct link was found */ private boolean parseWikiLink() { int startLinkPosition = fCurrentPosition; if (getNextChar('[')) { return parseWikiTag(); } else if (getNextCharAsWhitespace()) { fCurrentPosition--; return false; } else { createContentToken(1); fWhiteStart = false; if (readUntilCharOrStopAtEOL(']')) { String name = fStringSource.substring(startLinkPosition, fCurrentPosition - 1); if (handleHTTPLink(name)) { return true; } } fCurrentPosition = startLinkPosition; } return false; } /** * Parse a wiki section starting with a '[[' sequence * * @return true if a correct link was 
found */ private boolean parseWikiTag() { int startLinkPosition = fCurrentPosition; int endLinkPosition; // wikipedia link style createContentToken(2); int temp = fCurrentPosition; if (findWikiLinkEnd()) { endLinkPosition = fCurrentPosition - 2; final String name = fStringSource.substring(startLinkPosition, endLinkPosition); // test for a suffix string behind the Wiki link. Useful for plurals. // Example: Dolphins are [[aquatic mammal]]s that are closely related to [[whale]]s and [[porpoise]]s. temp = fCurrentPosition; String suffix = ""; try { fCurrentCharacter = fSource[fCurrentPosition]; if (Character.isLowerCase(fCurrentCharacter)) { fCurrentPosition++; StringBuilder suffixBuffer = new StringBuilder(16); suffixBuffer.append(fCurrentCharacter); while (true) { fCurrentCharacter = fSource[fCurrentPosition++]; if (!Character.isLowerCase(fCurrentCharacter)) { fCurrentPosition--; break; } suffixBuffer.append(fCurrentCharacter); } suffix = suffixBuffer.toString(); } } catch (IndexOutOfBoundsException e) { fCurrentPosition = temp; } fEventListener.onWikiLink(fSource, startLinkPosition, endLinkPosition, suffix); if (!fWikiModel.appendRawWikipediaLink(name, suffix)) { fCurrentPosition = temp; } return true; } else { fWhiteStart = true; fWhiteStartPosition = startLinkPosition - 2; fCurrentPosition = temp + 1; } return false; } private boolean parsePreformattedWikiBlock() { if (isStartOfLine() && !isEmptyLine(1)) { if (fWikiModel.stackSize() == 0 || !(fWikiModel.peekNode() instanceof HTMLBlockTag) || (fWikiModel.peekNode() instanceof PTag)) { createContentToken(2); fWikiModel.reduceTokenStack(Configuration.HTML_PRE_OPEN); // don't use Configuration.HTML_PRE_OPEN here // rendering differs between these tags! 
fWikiModel.pushNode(new WPPreTag()); char ch = ' '; try { while (ch == ' ' || ch == '\t') { // SPACE or TAB => check if it's a pre-formatted text fWhiteStart = true; fWhiteStartPosition = fCurrentPosition; ch = fSource[fCurrentPosition++]; while (ch != '\n' && fCurrentPosition < fSource.length) { ch = fSource[fCurrentPosition++]; } if (fCurrentPosition == fSource.length) { // scanner reached end of text if (!createPreContentToken(0)) { fCurrentPosition = fWhiteStartPosition; fSource[fWhiteStartPosition - 1] = '\n'; return false; } } else { ch = fSource[fCurrentPosition++]; if (ch == ' ' || ch == '\t') { if (!createPreContentToken(1)) { fCurrentPosition = fWhiteStartPosition; fSource[fWhiteStartPosition - 1] = '\n'; return false; } } else { // skip the newline character at the end of the // pre-formatted // block if (!createPreContentToken(2)) { fCurrentPosition = fWhiteStartPosition; fSource[fWhiteStartPosition - 1] = '\n'; return false; } else { fCurrentPosition--; return true; } } } } } catch (IndexOutOfBoundsException e) { fCurrentPosition--; } finally { fWikiModel.popNode(); } } return true; } return false; } /** * Parse ---- as <hr> tag */ private boolean parseHorizontalRuler() { if (isStartOfLine()) { int tempCurrPosition = fCurrentPosition; try { if (fSource[tempCurrPosition++] == '-' && fSource[tempCurrPosition++] == '-' && fSource[tempCurrPosition++] == '-') { int pos = isEndOfLine('-', tempCurrPosition); if (pos > 0) { HrTag hr = new HrTag(); createContentToken(2); fWikiModel.reduceTokenStack(hr); fCurrentPosition = pos; fWikiModel.append(hr); fWhiteStart = false; return true; } } } catch (IndexOutOfBoundsException ignored) { } } return false; } /** * Parse a wiki list
      *
      * Example:
      * *

           * * first line
           * * second line
           * * third line
           * 
      */ private boolean parseLists() { // set scanner pointer to '\n' character: if (isStartOfLine()) { setPosition(fCurrentPosition - 2); WPList list = wpList(); if (list != null && !list.isEmpty()) { createContentToken(1); fWikiModel.reduceTokenStack(list); fCurrentPosition = getPosition() - 1; fWikiModel.append(list); return true; } } return false; } /** * Parses a wiki header line into "h1, h2, h3, h4, h5, h6" HTML * tags.
      *
      * Example wiki syntax header line:
      * == Test header 2 == * * @return true if a header line could be parsed correctly, * false otherwise. */ private boolean parseSectionHeaders() { if (isStartOfLine()) { int headerStartPosition = fCurrentPosition - 1; int endIndex = fStringSource.indexOf("\n", fCurrentPosition); if (endIndex < 0) { endIndex = fStringSource.length(); } int headerEndPosition = endIndex; char ch; while (headerEndPosition > 0) { ch = fSource[--headerEndPosition]; if (!Character.isWhitespace(ch)) { break; } } if (headerEndPosition < 0 || headerEndPosition <= headerStartPosition) { return false; } int level = 0; int startPosition = headerStartPosition; int endPosition = headerEndPosition + 1; while (headerStartPosition < headerEndPosition) { if (fSource[headerStartPosition] == '=' && fSource[headerEndPosition] == '=') { level++; headerStartPosition++; headerEndPosition--; } else { headerEndPosition++; break; } } if (level == 0) { return false; } if (level > 6) { level = 6; } createContentToken(1); reduceTokenStack(); String head = ""; if (headerEndPosition >= headerStartPosition) { if (headerEndPosition > headerStartPosition) { head = fStringSource.substring(headerStartPosition, headerEndPosition); } else { head = String.valueOf(fStringSource .charAt(headerStartPosition)); } } fEventListener.onHeader(fSource, startPosition, endPosition, headerStartPosition, headerEndPosition, level); fCurrentPosition = endIndex; fTableOfContentTag = fWikiModel.appendHead(head, level, fNoToC, ++fHeadCounter, startPosition, endPosition); return true; } return false; } private boolean parseTable() { if (isStartOfLine()) { // wiki table ? 
setPosition(fCurrentPosition - 1); WPTable table = wpTable(fTableOfContentTag); if (table != null) { createContentToken(1); fWikiModel.reduceTokenStack(table); // set pointer behind: "\n|}" fCurrentPosition = getPosition(); fWikiModel.append(table); // table.filter(fSource, fWikiModel); return true; } } return false; } private boolean parseTemplate() { try { // dummy parsing of Wikipedia templates for event listeners // doesn't change fCurrentPosition if (fSource[fCurrentPosition] == '{') { int templateStartPosition = fCurrentPosition + 1; if (fSource[templateStartPosition] != '{') { int templateEndPosition = findNestedTemplateEnd(fSource, templateStartPosition); if (templateEndPosition > 0) { fEventListener.onTemplate(fSource, templateStartPosition, templateEndPosition - 2); return true; } } } } catch (Exception ignored) { } return false; } /** * Parse special identifiers like __TOC__, __NOTOC__, __FORCETOC__ */ private boolean parseSpecialIdentifiers() { if (fSource.length > fCurrentPosition && fSource[fCurrentPosition] == '_') { int oldPosition = fCurrentPosition; try { fCurrentPosition++; int tocEndPosition = fCurrentPosition; char ch; while (true) { ch = fSource[tocEndPosition++]; if (ch >= 'A' && ch <= 'Z') { continue; } break; } if (ch == '_' && fSource[tocEndPosition] == '_') { String tocIdent = fStringSource.substring(fCurrentPosition, tocEndPosition - 1); if (fWikiModel.parseBehaviorSwitch(tocIdent)) { createContentToken(2); fCurrentPosition = tocEndPosition + 1; return true; } boolean tocRecognized = false; for (int i = 0; i < TOC_IDENTIFIERS.length; i++) { if (TOC_IDENTIFIERS[i].equals(tocIdent)) { createContentToken(2); tocRecognized = true; fCurrentPosition = tocEndPosition + 1; switch (i) { case 0: // TOC fTableOfContentTag = fWikiModel .createTableOfContent(true); fForceToC = true; break; case 1: // NOTOC setNoToC(true); break; case 2: // FORCETOC fForceToC = true; break; } break; } } if (tocRecognized) { return true; } } } catch 
(IndexOutOfBoundsException e) { // end of scanner text } fCurrentPosition = oldPosition; } return false; } /** * Check if the scanners cursor position is at the beginning of a line. * * @return true if the scanners cursor points to the beginning * of a line, false otherwise. */ private boolean isStartOfLine() { if (fCurrentPosition >= 2) { if (fSource[fCurrentPosition - 2] == '\n') { return true; } } else if (fCurrentPosition == 1) { return true; } return false; } private int isEndOfLine(char testChar, int currentPosition) { int tempPosition = currentPosition; try { char ch; while (true) { ch = fSource[tempPosition]; if (ch != testChar) { break; } tempPosition++; } while (true) { ch = fSource[tempPosition++]; if (ch == '\n') { return tempPosition; } else if (!Character.isWhitespace(ch)) { return -1; } } } catch (IndexOutOfBoundsException ignored) { } return -1; } private void createTag(TagToken tag, WikiTagNode tagNode, int startMacroPosition) { String endTag; String macroBodyString; int index0; String command = tagNode.getTagName(); if ((tag != null) && (tag instanceof IBodyTag) && (!tagNode.isEmptyXmlTag())) { endTag = command + '>'; index0 = Util.indexOfIgnoreCase(fStringSource, "= 0) { macroBodyString = fStringSource.substring(startMacroPosition, index0); fCurrentPosition = index0 + endTag.length() + 2; } else { macroBodyString = fStringSource.substring(startMacroPosition, fSource.length); fCurrentPosition = fSource.length; } } else { macroBodyString = null; fCurrentPosition = startMacroPosition; } handleTag(tag, tagNode, macroBodyString); } private boolean handleHTTPLink(String name) { String urlString; String uriSchemeName = ""; if (name != null) { boolean isEmail = false; int index = -1; boolean foundUrl = false; boolean protocolRelativeURL = false; urlString = name.trim(); if (urlString.length() >= 2 && urlString.charAt(0) == '/' && urlString.charAt(1) == '/') { // issue 89 foundUrl = true; protocolRelativeURL = true; } else { try { index = 
urlString.indexOf(':', 1); if (index > 0) { uriSchemeName = urlString.substring(0, index); if (uriSchemeName.equalsIgnoreCase("mailto")) { isEmail = true; foundUrl = true; } else { if (fWikiModel.isValidUriScheme(uriSchemeName)) { foundUrl = true; } } } } catch (IndexOutOfBoundsException ignored) { } } if (foundUrl) { // Wikipedia link style: name separated by invalid URL character? // see test: "open square bracket forbidden in URL (named) (bug 4377)" int pipeIndex = 0; while (pipeIndex < urlString.length() && (Encoder.isUrlIdentifierPart(urlString.charAt(pipeIndex)) || urlString.charAt(pipeIndex) == '\'')) { ++pipeIndex; } String alias; if (pipeIndex < urlString.length()) { if (urlString.charAt(pipeIndex) == ' ') { alias = urlString.substring(pipeIndex + 1); } else { alias = urlString.substring(pipeIndex); } urlString = urlString.substring(0, pipeIndex); } else { if (protocolRelativeURL) { alias = urlString.substring(2); } else { alias = urlString; } } if (isEmail) { String email; if (pipeIndex > 7) { email = urlString.substring(7, pipeIndex); } else { email = urlString.substring(7); } if (EmailValidator.getInstance().isValid(email)) { fWikiModel.appendMailtoLink(urlString, alias, false); return true; } } else { if (protocolRelativeURL) { fWikiModel.appendExternalLink(uriSchemeName, urlString, alias, false); return true; } parseURIScheme(); String uriSchemeSpecificPart = urlString .substring(index + 1); if (fWikiModel.isValidUriSchemeSpecificPart(uriSchemeName, uriSchemeSpecificPart)) { fWikiModel.appendExternalLink(uriSchemeName, urlString, alias, false); return true; } } } } return false; } private void handleTag(TagToken tag, WikiTagNode tagNode, String bodyString) { String command = tagNode.getTagName(); try { if (tag instanceof EndTagToken) { fWikiModel.append(tag); } else { fWikiModel.pushNode(tag); if (null != bodyString) { if (tag instanceof INoBodyParsingTag) { ((TagNode) tag).addChild(new ContentToken(bodyString)); } else { // recursively filter tags 
within the tags body string WikipediaParser.parseRecursive(bodyString.trim(), fWikiModel, false, true); } } if (tag instanceof IBodyTag) { fWikiModel.popNode(); } } } catch (IllegalArgumentException e) { TagNode divTagNode = new TagNode("div"); divTagNode.addAttribute("class", "error", true); divTagNode.addChild(new ContentToken("IllegalArgumentException: " + command + " - " + e.getMessage())); fWikiModel.append(divTagNode); e.printStackTrace(); } catch (Throwable e) { e.printStackTrace(); TagNode divTagNode = new TagNode("div"); divTagNode.addAttribute("class", "error", true); divTagNode.addChild(new ContentToken(command + ": " + e.getMessage())); fWikiModel.append(divTagNode); e.printStackTrace(); } } public void runParser() { int token; while ((token = getNextToken()) != TokenEOF) { switch (token) { case TokenBOLDITALIC: if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) { fWikiModel.popNode(); } else if (fWikiModel.stackSize() > 1 && fWikiModel.peekNode().equals(BOLD) && fWikiModel.getNode(fWikiModel.stackSize() - 2) .equals(ITALIC)) { fWikiModel.popNode(); fWikiModel.popNode(); } else if (fWikiModel.stackSize() > 1 && fWikiModel.peekNode().equals(ITALIC) && fWikiModel.getNode(fWikiModel.stackSize() - 2) .equals(BOLD)) { fWikiModel.popNode(); fWikiModel.popNode(); } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLD)) { fWikiModel.popNode(); fWikiModel.pushNode(new WPTag("i")); } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(ITALIC)) { fWikiModel.popNode(); fWikiModel.pushNode(new WPTag("b")); } else { fWikiModel.pushNode(new WPBoldItalicTag()); } break; case TokenBOLD: if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) { fWikiModel.popNode(); fWikiModel.pushNode(new WPTag("i")); // fResultBuffer.append(""); } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLD)) { fWikiModel.popNode(); } else { fWikiModel.pushNode(new WPTag("b")); } break; case 
TokenITALIC: if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(BOLDITALIC)) { fWikiModel.popNode(); fWikiModel.pushNode(new WPTag("b")); } else if (fWikiModel.stackSize() > 0 && fWikiModel.peekNode().equals(ITALIC)) { fWikiModel.popNode(); } else { fWikiModel.pushNode(new WPTag("i")); } break; } } reduceTokenStack(); if (!fNoToC && fTableOfContentTag != null) { if (fHeadCounter > 3 || fForceToC) { fTableOfContentTag.setShowToC(true); } } } @Override public void setNoToC(boolean noToC) { fNoToC = noToC; } /** * Call the parser on the first recursion level, where the text can contain * a table of contents (TOC). * *
      *
      * Note: in this level the wiki model will call the * setUp() method before parsing and the * tearDown() method after the parser has finished. * * @param rawWikiText * the raw text of the article * @param wikiModel * a suitable wiki model for the given wiki article text * @param parseTemplates * parse the template expansion step * @param templateParserBuffer * if the templateParserBuffer != null the * templateParserBuffer will be used to append the * result of the template expansion step * */ public static void parse(String rawWikiText, IWikiModel wikiModel, boolean parseTemplates, Appendable templateParserBuffer) { try { // initialize the wiki model wikiModel.setUp(); if (parseTemplates) { Appendable buf; if (templateParserBuffer != null) { buf = templateParserBuffer; } else { buf = new StringBuilder(rawWikiText.length() + rawWikiText.length() / 10); } String templatesParsedText = rawWikiText; try { // TemplateParser.parse(templatesParsedText, wikiModel, buf, // wikiModel.isTemplateTopic()); TemplateParser.parseRecursive(templatesParsedText, wikiModel, buf, false, wikiModel.isTemplateTopic(), null); templatesParsedText = buf.toString(); } catch (Exception ioe) { ioe.printStackTrace(); templatesParsedText = "TemplateParser exception: " + ioe.getClass().getSimpleName() + ""; } String redirectedLink = parseRedirect( templatesParsedText, wikiModel); if (redirectedLink == null) { parseRecursive(templatesParsedText, wikiModel, false, false); } } else { if (parseRedirect(rawWikiText, wikiModel) == null) { parseRecursive(rawWikiText, wikiModel, false, false); } } } finally { // clean up wiki model if necessary wikiModel.tearDown(); } } /** * Call the parser on the subsequent recursion levels, where the subtexts * (of templates, table cells, list items or image captions) don't contain a * table of contents (TOC) * * Note: the wiki model doesn't call the setUp() or * tearDown() methods for the subsequent recursive parser * steps. 
* * @return HTML tags from the parsing process */ public static TagStack parseRecursive(String rawWikitext, IWikiModel wikiModel, boolean createOnlyLocalStack, boolean noTOC) { AbstractWikipediaParser parser = wikiModel.createNewInstance(rawWikitext); return parser.parseRecursiveInternal(wikiModel, createOnlyLocalStack, noTOC); } /** * Determine if the currently parsed wiki text is a template text. * * @return true if the currently parsed wiki text is a template */ private boolean isTemplate() { return fRenderTemplate; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy