
org.apache.maven.doxia.module.markdown.MarkdownParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of doxia-module-markdown Show documentation
Show all versions of doxia-module-markdown Show documentation
A Doxia module for Markdown source documents.
package org.apache.maven.doxia.module.markdown;
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.maven.doxia.markup.HtmlMarkup;
import org.apache.maven.doxia.module.xhtml.XhtmlParser;
import org.apache.maven.doxia.parser.AbstractParser;
import org.apache.maven.doxia.parser.ParseException;
import org.apache.maven.doxia.parser.Parser;
import org.apache.maven.doxia.sink.Sink;
import org.codehaus.plexus.component.annotations.Component;
import org.codehaus.plexus.component.annotations.Requirement;
import org.codehaus.plexus.util.IOUtil;
import org.codehaus.plexus.util.xml.pull.XmlPullParser;
import org.pegdown.Extensions;
import org.pegdown.PegDownProcessor;
import org.pegdown.ast.HeaderNode;
import org.pegdown.ast.HtmlBlockNode;
import org.pegdown.ast.Node;
import org.pegdown.ast.RootNode;
import org.pegdown.ast.SuperNode;
import org.pegdown.ast.TextNode;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Implementation of {@link org.apache.maven.doxia.parser.Parser} for Markdown documents.
*
* Defers effective parsing to the PegDown library, which generates HTML content
* then delegates parsing of this content to a slightly modified Doxia Xhtml parser.
*
* @author Julien Nicoulaud
* @since 1.3
* @see MarkdownToDoxiaHtmlSerializer
*/
@Component( role = Parser.class, hint = "markdown" )
public class MarkdownParser
extends AbstractParser
{
/**
* The role hint for the {@link MarkdownParser} Plexus component. The same value is used as the
* {@code hint} of the {@code @Component} annotation declared on this class.
*/
public static final String ROLE_HINT = "markdown";
/**
* The {@link PegDownProcessor} used to parse Markdown documents before they are converted to HTML.
* <p>
* It is configured with every PegDown extension enabled except {@code HARDWRAPS}
* ({@code Extensions.ALL & ~Extensions.HARDWRAPS}). The second constructor argument is
* {@code Long.MAX_VALUE}; presumably this is the maximum parsing time, i.e. effectively no
* timeout (TODO confirm against the PegDownProcessor constructor documentation).
* <p>
* NOTE(review): this single instance is static and therefore shared by every parser instance;
* verify that PegDownProcessor may be used concurrently before relying on that.
*/
protected static final PegDownProcessor PEGDOWN_PROCESSOR =
new PegDownProcessor( Extensions.ALL & ~Extensions.HARDWRAPS, Long.MAX_VALUE );
/**
* Regex that identifies a multimarkdown-style metadata section at the start of the document.
* <p>
* Group 1 captures the whole run of one or more {@code key: value} entries. A key starts with a
* character that is neither whitespace nor a colon; a value runs to the end of its line and may
* continue on following lines that begin with blanks. The section must be followed by a line
* that contains only whitespace, which is matched but not captured.
* <p>
* NOTE(review): the pattern is compiled with {@code Pattern.MULTILINE} and applied with
* {@code find()}, so {@code ^} can also match at the start of a later line, not only at the very
* beginning of the input; confirm whether that is intended.
*/
private static final String MULTI_MARKDOWN_METADATA_SECTION =
"^(((?:[^\\s:][^:]*):(?:.*(?:\r?\n\\p{Blank}+[^\\s].*)*\r?\n))+)(?:\\s*\r?\n)";
/**
* Regex that captures the key and value of a multimarkdown-style metadata entry.
* <p>
* Group 1 is the key (the text before the colon) and group 2 is the value, including any
* continuation lines that start with blanks. The line terminator that ends the entry is matched
* but not captured. Callers trim both groups before use.
*/
private static final String MULTI_MARKDOWN_METADATA_ENTRY =
"([^\\s:][^:]*):(.*(?:\r?\n\\p{Blank}+[^\\s].*)*)\r?\n";
/**
* In order to ensure that we have minimal risk of false positives when slurping metadata sections, the
* first key in the metadata section must be one of these standard keys or else the entire metadata section is
* ignored. Keys are compared case-insensitively.
*/
private static final String[] STANDARD_METADATA_KEYS =
{ "title", "author", "date", "address", "affiliation", "copyright", "email", "keywords", "language", "phone",
"subtitle" };
/**
* Reports the type of source this parser handles.
*
* @return the {@code TXT_TYPE} constant
*/
public int getType()
{
return TXT_TYPE;
}
/**
* The parser that receives the HTML generated from the Markdown source and emits the
* corresponding events to the {@link Sink}. Declared as a Plexus {@code @Requirement}, so it is
* expected to be injected by the container rather than assigned in this class.
*/
@Requirement
private PegDownHtmlParser parser;
/**
 * Parses a Markdown document and emits it to the given sink.
 * <p>
 * The work is done in two stages: the Markdown source is first rendered to an HTML string,
 * and that string is then handed to the injected HTML parser, which drives the sink.
 *
 * @param source reader supplying the Markdown document
 * @param sink the sink that receives the parsed document events
 * @throws ParseException if the delegate parser fails, or if reading the source raises an
 *             {@link IOException} (which is wrapped as the cause)
 */
public void parse( Reader source, Sink sink )
    throws ParseException
{
    try
    {
        // Stage 1: Markdown -> HTML (Pegdown); Stage 2: HTML -> Sink API.
        final StringReader htmlReader = new StringReader( toHtml( source ) );
        parser.parse( htmlReader, sink );
    }
    catch ( IOException ioe )
    {
        throw new ParseException( "Failed reading Markdown source document", ioe );
    }
}
/**
 * Uses the PegDown library to parse Markdown content and generate a complete HTML document.
 * <p>
 * The generated document has the shape
 * {@code <html><head>...</head><body>...</body></html>}:
 * <ol>
 * <li>If the text contains a multimarkdown-style metadata section whose first key is one of
 * {@link #STANDARD_METADATA_KEYS}, each entry is written to the head: {@code title} becomes a
 * {@code <title>} element, every other key becomes a {@code <meta name='..' content='..' />}
 * element. The text up to the end of that section is then removed before Markdown parsing.</li>
 * <li>If no {@code title} entry was found and the first node that is not an HTML comment is a
 * heading, the text of that heading is used as the {@code <title>}.</li>
 * <li>The body is the HTML produced by {@link MarkdownToDoxiaHtmlSerializer} for the parsed
 * Markdown.</li>
 * </ol>
 * All metadata keys and values, and the title text, are XML-escaped before being written.
 *
 * @param source the Markdown source
 * @return HTML content generated by PegDown
 * @throws IOException if the source cannot be read
 * @see MarkdownToDoxiaHtmlSerializer
 */
private String toHtml( Reader source )
    throws IOException
{
    String text = IOUtil.toString( source );
    StringBuilder html = new StringBuilder( text.length() * 2 );
    html.append( "<html>" );
    html.append( "<head>" );

    Pattern sectionPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_SECTION, Pattern.MULTILINE );
    Matcher metadataMatcher = sectionPattern.matcher( text );
    boolean haveTitle = false;
    if ( metadataMatcher.find() )
    {
        Pattern entryPattern = Pattern.compile( MULTI_MARKDOWN_METADATA_ENTRY, Pattern.MULTILINE );
        Matcher lineMatcher = entryPattern.matcher( metadataMatcher.group( 1 ) );
        // Stays true until the first entry has been accepted as a standard key.
        boolean first = true;
        while ( lineMatcher.find() )
        {
            String key = StringUtils.trimToEmpty( lineMatcher.group( 1 ) );
            if ( first )
            {
                boolean found = false;
                for ( String k : STANDARD_METADATA_KEYS )
                {
                    if ( k.equalsIgnoreCase( key ) )
                    {
                        found = true;
                        break;
                    }
                }
                if ( !found )
                {
                    // Not a real metadata section: leave the text untouched.
                    break;
                }
                first = false;
            }
            String value = StringUtils.trimToEmpty( lineMatcher.group( 2 ) );
            if ( "title".equalsIgnoreCase( key ) )
            {
                haveTitle = true;
                appendTitle( html, value );
            }
            else if ( "author".equalsIgnoreCase( key ) )
            {
                appendMetaTag( html, "author", value );
            }
            else if ( "date".equalsIgnoreCase( key ) )
            {
                appendMetaTag( html, "date", value );
            }
            else
            {
                appendMetaTag( html, key, value );
            }
        }
        if ( !first )
        {
            // The section was consumed as metadata, so it must not be rendered as Markdown.
            text = text.substring( metadataMatcher.end() );
        }
    }

    RootNode rootNode = PEGDOWN_PROCESSOR.parseMarkdown( text.toCharArray() );
    if ( !haveTitle )
    {
        // use the first (non-comment) node only if it is a heading
        Node firstNode = null;
        for ( Node child : rootNode.getChildren() )
        {
            firstNode = child;
            if ( !isHtmlComment( child ) )
            {
                break;
            }
        }
        if ( firstNode instanceof HeaderNode )
        {
            appendTitle( html, nodeText( firstNode ) );
        }
    }
    html.append( "</head>" );
    html.append( "<body>" );
    html.append( new MarkdownToDoxiaHtmlSerializer().toHtml( rootNode ) );
    html.append( "</body>" );
    html.append( "</html>" );
    return html.toString();
}

/**
 * Appends a {@code <title>} element whose content is the XML-escaped title.
 */
private static void appendTitle( StringBuilder html, String title )
{
    html.append( "<title>" );
    html.append( StringEscapeUtils.escapeXml( title ) );
    html.append( "</title>" );
}

/**
 * Appends a {@code <meta name='..' content='..' />} element with XML-escaped name and content.
 */
private static void appendMetaTag( StringBuilder html, String name, String content )
{
    html.append( "<meta name='" );
    html.append( StringEscapeUtils.escapeXml( name ) );
    html.append( "' content='" );
    html.append( StringEscapeUtils.escapeXml( content ) );
    html.append( "' />" );
}
public static boolean isHtmlComment( Node node )
{
if ( node instanceof HtmlBlockNode )
{
HtmlBlockNode blockNode = (HtmlBlockNode) node;
return blockNode.getText().startsWith( "