com.sangupta.nutz.TextNodeParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of nutz Show documentation
Show all versions of nutz Show documentation
Markdown processor for the JVM
The newest version!
/**
*
* nutz - Markdown processor for JVM
* Copyright (c) 2012, Sandeep Gupta
*
* http://www.sangupta/projects/nutz
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.sangupta.nutz;
import java.util.Arrays;
import com.sangupta.nutz.ast.AnchorNode;
import com.sangupta.nutz.ast.EmailNode;
import com.sangupta.nutz.ast.EmphasisNode;
import com.sangupta.nutz.ast.HtmlCommentNode;
import com.sangupta.nutz.ast.ImageNode;
import com.sangupta.nutz.ast.InlineCodeNode;
import com.sangupta.nutz.ast.NewLineNode;
import com.sangupta.nutz.ast.Node;
import com.sangupta.nutz.ast.ParagraphNode;
import com.sangupta.nutz.ast.PlainTextNode;
import com.sangupta.nutz.ast.SpecialCharacterNode;
import com.sangupta.nutz.ast.StrongNode;
import com.sangupta.nutz.ast.TextNode;
import com.sangupta.nutz.ast.UnreferencedAnchorNode;
import com.sangupta.nutz.ast.XmlNode;
/**
* Class that parses given text taking care of intricacies in line like text-styles, embedded
* images, anchors etc.
*
* @author sangupta
* @since 0.4
*/
public class TextNodeParser implements Identifiers {
// The markup text currently being parsed; assigned at the start of parse(Node, TextNode, String).
private String line;
// Index into line marking where not-yet-emitted text begins; clearPending() flushes
// the range [lastConverted, pos) into a node and then moves this marker up to pos.
private int lastConverted;
// Index of the character currently being examined by the scanner.
private int pos;
// Cached value of line.length() for the line being parsed.
private int length;
// Node that receives every generated child; set when a parse starts and
// reset to null once the parse has completed.
private TextNode root = null;
/**
 * Parses a line of markup, collecting the generated nodes under a newly
 * created {@link ParagraphNode} whose parent is the supplied node.
 *
 * @param parent the node handed to the new paragraph node as its parent
 * @param line the markup text to parse
 * @return whatever the three-argument overload returns for the new paragraph node
 */
public TextNode parse(Node parent, String line) {
    final ParagraphNode paragraph = new ParagraphNode(parent);
    return parse(parent, paragraph, line);
}
/**
 * Parses the given markup line and adds the resulting nodes as children
 * of the supplied root node.
 *
 * <p>The parser keeps its cursor state in instance fields, so a single
 * instance must not be used for two parses at the same time.
 *
 * @param parent not read by this method; kept for signature compatibility
 * @param rootNode the node that receives the parsed children
 * @param line the markup text to parse; must not be <code>null</code>
 * @return the same <code>rootNode</code> instance that was passed in
 */
public TextNode parse(Node parent, TextNode rootNode, String line) {
    this.line = line;
    this.length = line.length();
    this.pos = 0;
    this.lastConverted = 0;
    this.root = rootNode;

    try {
        // parse into tokens
        parse();
    } finally {
        // always drop the reference to the caller's node, even when parsing
        // fails, so a failed parse does not leave this parser holding on to it
        this.root = null;
    }

    return rootNode;
}
/**
 * Scans the line from the current position to its end. A new-line character
 * is turned into its own node straight away; every other position is
 * offered to the markup parsers and to the special-character handler before
 * the cursor moves on. Text that is still unconverted when the scan ends is
 * flushed at the end.
 */
private void parse() {
    do {
        if(!charAt(pos, NEW_LINE)) {
            dispatchMarkupAtCurrentPosition();
            handleSpecialCharacters();
            pos++;
        } else {
            // a line break becomes its own node; nothing else is tried here
            clearPending();
            root.addChild(new NewLineNode(root));
            lastConverted = ++pos;
        }
    } while(pos < line.length());

    clearPending();
}

/**
 * Runs the markup checks for the current position, in fixed order: escape,
 * HTML/autolink, inline code, image, link, star emphasis and underscore
 * emphasis. A parser may move <code>pos</code>; the checks that follow then
 * look at the new position.
 */
private void dispatchMarkupAtCurrentPosition() {
    if(charAt(pos, ESCAPE_CHARACTER)) {
        // flush what came before and step over the escape character itself
        clearPending();
        pos++;
        lastConverted++;
    }

    if(charAt(pos, HTML_OR_AUTOLINK_START)) {
        clearPending();
        final boolean htmlComment = charAt(pos + 1, EXCLAIMATION)
                && charAt(pos + 2, HYPHEN)
                && charAt(pos + 3, HYPHEN);
        if(htmlComment) {
            parseInlineHTMLComment();
        } else {
            parseHtmlOrAutoLinkBlock();
        }
    }

    if(startsUnescapedMarker(CODE_MARKER)) {
        clearPending();
        parseCharacterBlock(CODE_MARKER);
    }

    if(charAt(pos, EXCLAIMATION) && charAt(pos + 1, LINK_START)) {
        clearPending();
        parseImageBlock();
    }

    if(startsUnescapedMarker(LINK_START)) {
        clearPending();
        parseLink();
    }

    if(startsUnescapedMarker(ITALIC_OR_BOLD)) {
        clearPending();
        parseCharacterBlock(ITALIC_OR_BOLD);
    }

    if(startsUnescapedMarker(ITALIC_OR_BOLD_UNDERSCORE)) {
        clearPending();
        parseCharacterBlock(ITALIC_OR_BOLD_UNDERSCORE);
    }
}

/**
 * Tells whether the given marker sits at the current position without an
 * escape character directly in front of it.
 */
private boolean startsUnescapedMarker(final char marker) {
    return charAt(pos, marker) && !charAt(pos - 1, ESCAPE_CHARACTER);
}
/**
 * Converts the character at the current position into a
 * {@link SpecialCharacterNode} when it needs escaping in HTML.
 *
 * <p>The angle-bracket characters are always converted. An ampersand is
 * converted unless it starts something shaped like an HTML entity
 * (<code>&amp;name;</code> or <code>&amp;#digits;</code>), in which case it
 * is left in the pending text.
 *
 * <p>This method does not move <code>pos</code>: the scanning loop that
 * calls it advances the cursor by one right afterwards. Only
 * <code>lastConverted</code> is moved past the converted character, so that
 * the character following it is still examined by the loop.
 */
private void handleSpecialCharacters() {
    if(this.pos >= this.length) {
        return;
    }

    final char character = line.charAt(pos);
    final boolean conversion;

    switch(character) {
        case AMPERSAND:
            // an ampersand near the end of the line must be escaped as well,
            // so the decision is based on what follows, not on remaining length
            conversion = !startsHtmlEntity(pos);
            break;

        case HTML_OR_AUTOLINK_START:
        case HTML_OR_AUTOLINK_END:
            conversion = true;
            break;

        default:
            conversion = false;
            break;
    }

    if(!conversion) {
        return;
    }

    // flush the text before this character, then emit the character itself
    clearPending();
    root.addChild(new SpecialCharacterNode(root, character));

    // the caller advances pos by one; only mark this character as consumed
    lastConverted = pos + 1;
}

/**
 * Checks whether the ampersand at the given index begins an entity-like
 * token: an optional '#', one or more ASCII letters or digits, then ';'.
 *
 * @param ampersand index of the ampersand inside the current line
 * @return <code>true</code> if such a token follows the ampersand
 */
private boolean startsHtmlEntity(final int ampersand) {
    int cursor = ampersand + 1;
    if(cursor < this.length && line.charAt(cursor) == '#') {
        cursor++;
    }

    final int nameStart = cursor;
    while(cursor < this.length) {
        final char c = line.charAt(cursor);
        final boolean alphaNumeric = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
        if(!alphaNumeric) {
            break;
        }
        cursor++;
    }

    return cursor > nameStart && cursor < this.length && line.charAt(cursor) == ';';
}
/**
 * Consumes an inline HTML comment that begins at the current position. The
 * text from the current position up to and including the closing
 * <code>--&gt;</code> is added as an {@link HtmlCommentNode}. When no closing
 * sequence exists the method returns without changing any state.
 */
private void parseInlineHTMLComment() {
    final String terminator = "-->";
    final int close = line.indexOf(terminator, pos + 3);
    if(close == -1) {
        return;
    }

    final int end = close + 3;
    root.addChild(new HtmlCommentNode(line.substring(pos, end)));

    // continue scanning right behind the comment
    pos = end;
    lastConverted = end;
}
/**
 * Handles an opening angle bracket at the current position. The text up to
 * the next closing bracket is first tried as an autolink (URL, then email
 * address). Failing that, it is treated as an HTML tag and everything up to
 * the end of that element is added as one {@link XmlNode}, so that scanning
 * can resume behind it. If nothing matches, no state is changed.
 */
private void parseHtmlOrAutoLinkBlock() {
    final int closing = line.indexOf(HTML_OR_AUTOLINK_END, pos + 1);
    if(closing == -1) {
        // no closing bracket at all - leave it to the caller
        return;
    }

    final String markup = line.substring(pos + 1, closing);

    // autolinks: a hyperlink is checked first, an email address second
    final boolean hyperLink = MarkupUtils.isHyperLink(markup);
    if(hyperLink || MarkupUtils.isEmail(markup)) {
        if(hyperLink) {
            root.addChild(new AnchorNode(root, markup));
        } else {
            root.addChild(new EmailNode(markup));
        }

        pos = closing + 1;
        lastConverted = pos;
        return;
    }

    // otherwise see whether this is an XML/HTML element
    final String tagName = extractTagName(markup);
    final int searchFrom = pos + tagName.length();

    int end = MarkupUtils.findEndingTagPosition(line, searchFrom, tagName);
    if(end == -1) {
        if(!MarkupUtils.isSingularHtmlElement(tagName)) {
            // neither a closed element nor a singular one - ignore
            return;
        }

        end = line.indexOf('>', searchFrom) + 1;
    }

    root.addChild(new XmlNode(line.substring(pos, end)));

    pos = end;
    lastConverted = end;
}
/**
 * Derives the tag name from the inside of an HTML tag: everything before
 * the first space, with a single trailing slash removed.
 *
 * @param markup the text found between the angle brackets
 * @return the tag name portion of the markup
 */
private String extractTagName(String markup) {
    final int space = markup.indexOf(SPACE);
    final String head = (space == -1) ? markup : markup.substring(0, space);
    return head.endsWith("/") ? head.substring(0, head.length() - 1) : head;
}
/**
 * Parses an image starting at the current position, which the caller has
 * verified to be an exclamation mark followed by the link-start character.
 *
 * <p>Two forms are handled after the alternate text: an inline target in
 * parentheses (split into link and title through
 * <code>MarkupUtils.parseLinkAndTitle</code>) and a reference identifier in
 * brackets. Spaces between the alternate text and the target are skipped.
 *
 * <p>If the closing bracket of the alternate text is missing, nothing is
 * changed. If the alternate text is closed but no valid target follows,
 * <code>pos</code> is left just behind the closing bracket while
 * <code>lastConverted</code> is untouched, so the text is emitted as plain
 * text later on.
 */
private void parseImageBlock() {
    // 2 because the caller has just matched two characters
    final int altEnd = line.indexOf(LINK_END, pos + 2);
    if(altEnd == -1) {
        // not an image
        return;
    }

    final String alternateText = line.substring(pos + 2, altEnd);
    pos = altEnd + 1;

    // skip spaces in front of the target, never reading past the line end
    int opener = pos;
    while(opener < this.length && line.charAt(opener) == SPACE) {
        opener++;
    }

    if(opener >= this.length) {
        // the line ended right after the alternate text
        return;
    }

    final char ch = line.charAt(opener);
    final int index;

    if(ch == HREF_START) {
        // we do allow empty URLs - this is a test in Daring Fireball's test suite
        index = MarkupUtils.indexOfSkippingForPairCharacter(line, HREF_END, HREF_START, opener + 1);
        if(index == -1) {
            return;
        }

        // measured from the opener, so skipped spaces are not part of the link
        final String link = line.substring(opener + 1, index);

        // break this link into link and title
        final String[] tokens = MarkupUtils.parseLinkAndTitle(link);
        root.addChild(new ImageNode(tokens[0], alternateText, tokens[1]));
    } else if(ch == LINK_START) {
        // start right behind the opener so that an empty identifier is found too
        index = line.indexOf(LINK_END, opener + 1);
        if(index == -1) {
            // not a reference link
            return;
        }

        final String identifier = line.substring(opener + 1, index);
        root.addChild(new ImageNode(identifier, alternateText, true));
    } else {
        // neither an inline target nor a reference follows
        return;
    }

    // continue behind the closing character of the target
    pos = index + 1;
    lastConverted = pos;
}
/**
* Parse a hyperlink that may have been specified. A hyperlink
* may have more markup inside.
*
* Called with pos on the link-start character. Depending on what follows
* the bracketed text, one of three nodes is added to the root: an inline
* AnchorNode (target in parentheses), a reference AnchorNode (identifier in
* brackets), or an UnreferencedAnchorNode when neither follows.
*
* On the early returns where a target was opened but never closed, pos has
* already been moved while lastConverted is not touched by this method.
*/
private void parseLink() {
// locate the end of the link text
// NOTE(review): presumably the helper skips nested LINK_START/LINK_END pairs - confirm in MarkupUtils
int index = MarkupUtils.indexOfSkippingForPairCharacter(line, LINK_END, LINK_START, pos + 1); // line.indexOf(LINK_END, pos + 1);
if(index == -1) {
// this is not a hyperlink
return;
}
// this is the text
final String linkText = line.substring(pos + 1, index);
// both pos and index now point just behind the closing bracket of the text
pos = ++index;
// either a URL would be specified
// or a reference to another hyperlink
// would be provided
if(index >= line.length()) {
// this is the end of line
// means - this was an unreferenced anchor link
root.addChild(new UnreferencedAnchorNode(linkText));
// reset
pos = index;
lastConverted = pos;
// exit
return;
}
// read the first character behind the text; index ends up one past it
char ch = line.charAt(index++);
// number of space/new-line characters skipped in front of the target
int spaceCount = 0;
while(ch == SPACE || ch == '\n') {
// stop at the end of the line; ch then still holds the last whitespace character
if(index >= line.length()) {
break;
}
ch = line.charAt(index++);
spaceCount++;
}
if(ch != HREF_START && ch != LINK_START) {
// this is not a hyperlink
// but it may be a candidate for the unreferenced
// anchor node - create a node and exit
root.addChild(new UnreferencedAnchorNode(linkText));
// reset
// index - 1 is the last character read; subtracting the skipped
// whitespace puts pos back just behind the closing bracket
pos = index - 1 - spaceCount;
lastConverted = pos;
// exit
return;
}
// move ahead in position
// pos now sits on the opening character of the target, so the
// substring calls below can start at pos + 1
pos += spaceCount;
if(ch == HREF_START) {
// extract the URL
index = MarkupUtils.indexOfSkippingForPairCharacter(line, HREF_END, HREF_START, index);
if(index == -1) {
// not a hyperlink
return;
}
// extract the actual URL
String link = line.substring(pos + 1, index);
// see if we have some title in it or not
// tokens[0] is used as the URL and tokens[1] as the title below
String[] tokens = MarkupUtils.parseLinkAndTitle(link);
// create the node
AnchorNode anchorNode = new AnchorNode(root, linkText.trim(), tokens[0].trim(), tokens[1], false, spaceCount);
root.addChild(anchorNode);
} else if(ch == LINK_START) {
// this is the text
// the search starts directly behind the opening bracket
index = line.indexOf(LINK_END, index);
if(index == -1) {
// not a reference link
return;
}
// extract the identifier
String identifier = line.substring(pos + 1, index);
AnchorNode anchorNode = new AnchorNode(root, linkText.trim(), identifier, null, true, spaceCount);
root.addChild(anchorNode);
}
// final settlement
// continue scanning behind the closing character of the target
pos = index + 1;
lastConverted = pos;
}
/**
 * Parses a block delimited by a run of the given terminator character
 * (emphasis or inline code) starting at the current position.
 *
 * <p>The opening run is counted first. A closing run of the same length is
 * preferred; otherwise the next terminator that is not preceded by the
 * escape character closes the block. When no closing terminator exists,
 * the opening run is handed to {@link #convertCharacterBlock(int, char)}.
 *
 * <p>Every character access is bounds-checked, so a terminator run that
 * reaches the end of the line no longer raises an exception.
 *
 * @param terminator the delimiter character found at the current position
 */
private void parseCharacterBlock(final char terminator) {
    // count the total number of terminators available together
    int count = 1;
    while((pos + count) < this.length && line.charAt(pos + count) == terminator) {
        count++;
    }

    // first position behind the opening run
    int index = pos + count;
    int endCount = 0;

    if(index >= this.length) {
        // the opening run reaches the end of the line - nothing can close it
        index = -1;
    } else {
        // first let's see if we have a string of count characters
        // of the terminator available
        // if yes, we need to use that than finding and counting
        final int indexMultiple = MarkupUtils.indexOfMultiple(line, terminator, count, index);
        if(indexMultiple != -1) {
            index = indexMultiple;
            endCount = count;
        } else {
            index = indexOfClosingTerminator(terminator, index);
            if(index != -1) {
                // count how many terminators close the block
                endCount = 1;
                while((index + endCount) < this.length && line.charAt(index + endCount) == terminator) {
                    endCount++;
                }
            }
        }
    }

    if(index == -1) {
        // none available
        // we need to break only the available ones
        // into nodes
        convertCharacterBlock(count, terminator);

        // reset
        pos = pos + count;
        lastConverted = pos;
        return;
    }

    final String text = line.substring(pos + count, index);

    // now we have the text that is between these starting and ending
    // terminator string. convert this to markup
    convertCharacterBlock(count, endCount, text, terminator);

    // reset
    pos = index + endCount;
    lastConverted = pos;
}

/**
 * Finds the next terminator at or after the given index, going character
 * by character because an escape character skips the character behind it.
 *
 * @param terminator the delimiter to look for
 * @param from index at which the search starts
 * @return index of the terminator, or <code>-1</code> if the line ends first
 */
private int indexOfClosingTerminator(final char terminator, final int from) {
    int scan = from;
    while(scan < this.length) {
        final char c = line.charAt(scan);
        if(c == ESCAPE_CHARACTER) {
            // skip the escape character and whatever it escapes
            scan += 2;
        } else if(c == terminator) {
            return scan;
        } else {
            scan++;
        }
    }

    return -1;
}
/**
 * Converts the text found between an opening and a closing terminator run
 * into a node and adds it to the root.
 *
 * <p>For the emphasis terminators, matching runs of length 1, 2 and 3
 * produce an emphasis node, a strong node and a strong node wrapping an
 * emphasis node respectively. For the code marker an inline code node with
 * the trimmed text is produced regardless of the run lengths.
 *
 * <p>Any other combination is written out literally (opening run, text,
 * closing run) as plain text, so that input is never dropped.
 *
 * @param startCount number of terminators that opened the block
 * @param endCount number of terminators that closed the block
 * @param text the text between the two runs
 * @param terminator the delimiter character
 */
private void convertCharacterBlock(int startCount, int endCount, String text, final char terminator) {
    switch(terminator) {
        case ITALIC_OR_BOLD:
        case ITALIC_OR_BOLD_UNDERSCORE:
            if(startCount == endCount) {
                switch(startCount) {
                    case 1:
                        // create an italic text node
                        root.addChild(new EmphasisNode(root, text));
                        return;

                    case 2:
                        // create a strong node
                        root.addChild(new StrongNode(root, text));
                        return;

                    case 3:
                        // strong wrapping emphasis
                        StrongNode strongNode = new StrongNode(root);
                        EmphasisNode emphasisNode = new EmphasisNode(strongNode, text);
                        strongNode.setTextNode(emphasisNode);
                        root.addChild(strongNode);
                        return;
                }
            }
            break;

        case CODE_MARKER:
            // create the node
            root.addChild(new InlineCodeNode(root, text.trim()));
            return;
    }

    // unsupported combination of run lengths: keep the input verbatim
    // instead of discarding it
    final StringBuilder literal = new StringBuilder(startCount + text.length() + endCount);
    for(int i = 0; i < startCount; i++) {
        literal.append(terminator);
    }
    literal.append(text);
    for(int i = 0; i < endCount; i++) {
        literal.append(terminator);
    }

    root.addChild(new PlainTextNode(root, literal.toString()));
}
/**
 * Emits a run of terminator characters that has no closing counterpart
 * (for example the star '*' or the underscore '_' used for bold and
 * italic) as literal text.
 *
 * <p>The run is appended to the last child of the root when that child is
 * a plain text node; otherwise a new plain text node is added. Runs of any
 * length are written out, so no input characters are lost.
 *
 * @param count number of consecutive terminator characters
 * @param terminator the terminator character to emit
 */
private void convertCharacterBlock(int count, char terminator) {
    final char[] array = new char[count];
    Arrays.fill(array, terminator);
    final String literal = String.valueOf(array);

    // let's fill this in the previous plain text node if present
    if(root.hasChild() && (root.lastNode() instanceof PlainTextNode)) {
        PlainTextNode node = (PlainTextNode) root.lastNode();
        node.appendText(literal);
    } else {
        root.addChild(new PlainTextNode(root, literal));
    }
}
/**
 * Flushes the text between <code>lastConverted</code> and the current
 * position into a node: a new-line node if that text is exactly one line
 * break, a plain text node otherwise. The current position is first clamped
 * to the line length, and <code>lastConverted</code> always ends up equal
 * to it.
 */
private void clearPending() {
    pos = Math.min(pos, this.length);

    if(lastConverted < pos) {
        final String pending = line.substring(lastConverted, pos);
        if(!"\n".equals(pending)) {
            root.addChild(new PlainTextNode(root, pending));
        } else {
            root.addChild(new NewLineNode(root));
        }
    }

    lastConverted = pos;
}
/**
 * Bounds-safe character comparison against the current line.
 *
 * @param pos index to look at; may lie outside the line
 * @param character the character expected at that index
 * @return <code>true</code> only if the index is inside the line and the
 *         character found there equals the supplied one
 */
private boolean charAt(int pos, char character) {
    final boolean inRange = pos >= 0 && pos < line.length();
    return inRange && line.charAt(pos) == character;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy